diff --git a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java index 0ac52f78ed7..23a856708b5 100755 --- a/icu4j/src/com/ibm/icu/text/CollationElementIterator.java +++ b/icu4j/src/com/ibm/icu/text/CollationElementIterator.java @@ -6,53 +6,77 @@ import com.ibm.icu.impl.NormalizerImpl; import com.ibm.icu.impl.UCharacterProperty; /** - *
The CollationElementIterator
class is used as an iterator
- * to walk through each character of an international string. Use the iterator
- * to return the ordering priority of the positioned character. The ordering
- * priority of a character, which we refer to as a key, defines how a
- * character is collated in the given collation object.
For example, consider the following in Spanish: + *
+ * The CollationElementIterator
object is an iterator created
+ * by a RuleBasedCollator to walk through an international string. The return
+ * result of each iteration is a 32 bit collation element that defines the
+ * ordering priority of the next sequence of characters in the source string.
+ *
For better illustration, consider the following in Spanish: *
** And in German, *- * "ca" -> the first key is key('c') and second key is key('a'). - * "cha" -> the first key is key('ch') and second key is key('a'). + * "ca" -> the first collation element is collation_element('c') and second + * collation element is collation_element('a'). + * + * Since "ch" in Spanish sorts as one entity, the below example returns one + * collation element for the 2 characters 'c' and 'h' + * + * "cha" -> the first collation element is collation_element('ch') and second + * collation element is collation_element('a'). **
** - *- * "\u00e4b"-> the first key is key('a'), the second key is key('e'), and - * the third key is key('b'). + * Since the character 'æ' is a composed character of 'a' and 'e', the + * below example returns 2 collation elements for the single character + * 'æ' + * + * "æb" -> the first collation element is collation_element('a'), the + * second collation element is collation_element('e'), and the + * third collation element is collation_element('b'). **
The key of a character is an integer composed of primary order(short),
- * secondary order(byte), and tertiary order(byte). Java strictly defines
- * the size and signedness of its primitive data types. Therefore, the static
- * functions primaryOrder
, secondaryOrder
, and
- * tertiaryOrder
return int
, short
,
- * and short
respectively to ensure the correctness of the key
- * value.
- * Example of the iterator usage, + * For collation ordering comparison, the collation element results can not be + * compared simply by using basic arithmetric operators, e.g. <, == or >, + * further processing has to be done. Details can be found in the ICU + * + * user guide. An example of using the CollationElementIterator for + * collation ordering comparison is the class + * com.ibm.icu.text.StringSearch. + *
+ *+ * To construct a CollationElementIterator object, users would have to call the + * factory method getCollationElementIterator() in a RuleBasedCollator object + * that defines the sorting order that is desired. + *
+ *+ * Example: *
** ** String testString = "This is a test"; - * RuleBasedCollator ruleBasedCollator = (RuleBasedCollator)Collator.getInstance(); - * CollationElementIterator collationElementIterator = ruleBasedCollator.getCollationElementIterator(testString); - * int primaryOrder = CollationElementIterator.primaryOrder(collationElementIterator.next()); + * RuleBasedCollator rbc = new RuleBasedCollator("&a<b"); + * CollationElementIterator collationElementIterator = rbc.getCollationElementIterator(testString); + * int primaryOrder = CollationElementIterator.IGNORABLE; + * while (primaryOrder != CollationElementIterator.NULLORDER) { + * int order = collationElementIterator.next(); + * if (order != CollationElementIterator.IGNORABLE && + * order != CollationElementIterator.NULLORDER) { + * // order is valid, not ignorable and we have not passed the end + * // of the iteration, we do something + * primaryOrder = CollationElementIterator.primaryOrder(order); + * System.out.println("Next primary order 0x" + Integer.toHexString(primaryOrder)); + * } + * } **
- * CollationElementIterator.next
returns the collation order
- * of the next character. A collation order consists of primary order,
- * secondary order and tertiary order. The data type of the collation
- * order is int. The first 16 bits of a collation order
- * is its primary order; the next 8 bits is the secondary order and the
- * last 8 bits is the tertiary order.
This constant is returned by the iterator in the methods next() and + * previous() when the end or the beginning of the source string has been + * reached, and there are no more valid collation elements to return.
+ *See class documentation for an example of use.
* @draft 2.2 + * @see #next + * @see #previous */ public final static int NULLORDER = 0xffffffff; /** - * Ignorable collation element order. + *This constant is returned by the iterator in the methods next() and + * previous() when a collation element result is to be ignored.
+ *See class documentation for an example of use.
+ * @draft 2.2 + * @see #next + * @see #previous */ public static final int IGNORABLE = 0; @@ -76,24 +110,25 @@ public final class CollationElementIterator // public getters ------------------------------------------------------- /** - *Returns the character offset in the original text corresponding to - * the next collation element. (That is, getOffset() returns the position - * in the text corresponding to the collation element that will be - * returned by the next call to next().) This value could be either + *
Returns the character offset in the source string corresponding to + * the next collation element. i.e. getOffset() returns the position + * in source string corresponding to the collation element that will be + * returned by the next call to next(). This value could be either *
setOffset(offset)
* sets the index in the middle of a contraction, getOffset()
* returns the index of the first character in the contraction, which
- * may not be equals to offset.
- * getOffset()
may return the
+ * may not be equals to the original offset that was set. Hence calling
+ * getOffset() immediately after setOffset(offset) does not guarantee that
+ * the original offset set will be returned.
+ * getOffset()
may return the
* index of the immediate subsequent character, or composite
* character with the first character, having a combining class of 0.
+ * Note calling getOffset() immediately after setOffset(offset) may not - * return the value offset.
- * @return The character offset in the original text corresponding to the + * @return The character offset in the source string corresponding to the * collation element that will be returned by the next call to * next(). * @draft 2.2 @@ -111,8 +146,11 @@ public final class CollationElementIterator /** - * Return the maximum length of any expansion sequences that end with the - * specified collation element. + *+ * Returns the maximum length of any expansion sequence that ends with + * the argument collation element ce. If there is no expansion with the + * argument ce as the last element, 1 is returned. + *
* @param ce a collation element returned by previous() or next(). * @return the maximum length of any expansion sequences ending * with the specified collation element. @@ -122,9 +160,11 @@ public final class CollationElementIterator { int start = 0; int limit = m_collator_.m_expansionEndCE_.length; + long unsignedce = ce & 0xFFFFFFFFl; while (start < limit - 1) { int mid = start + ((limit - start) >> 1); - if (ce <= m_collator_.m_expansionEndCE_[mid]) { + long midce = m_collator_.m_expansionEndCE_[mid] & 0xFFFFFFFFl; + if (unsignedce <= midce) { limit = mid; } else { @@ -135,7 +175,8 @@ public final class CollationElementIterator if (m_collator_.m_expansionEndCE_[start] == ce) { result = m_collator_.m_expansionEndCEMaxSize_[start]; } - else if (m_collator_.m_expansionEndCE_[limit] == ce) { + else if (limit < m_collator_.m_expansionEndCE_.length && + m_collator_.m_expansionEndCE_[limit] == ce) { result = m_collator_.m_expansionEndCEMaxSize_[limit]; } else if ((ce & 0xFFFF) == 0x00C0) { @@ -147,34 +188,49 @@ public final class CollationElementIterator // public other methods ------------------------------------------------- /** - *Resets the cursor to the beginning of the string. The next call - * to next() will return the first collation element in the string.
+ *+ * Resets the cursor to the beginning of the string. The next call + * to next() and previous will return the first and last collation element + * in the string respectively. + *
+ *+ * If the RuleBasedCollator used in this iterator has its attributes + * changed, calling reset() will reinitialize the iterator to use the new + * RuleBasedCollator attributes. + *
* @draft 2.2 */ - public synchronized void reset() + public void reset() { - m_source_.setIndex(0); + m_source_.setIndex(m_source_.getBeginIndex()); updateInternalState(); } /** - *Get the next collation element in the string.
- *This iterator iterates over a sequence of collation elements that - * were built from the string. Because there isn't necessarily a - * one-to-one mapping from characters to collation elements, this doesn't - * mean the same thing as "return the collation element [or ordering - * priority] of the next character in the string".
- *This function returns the collation element that the iterator is + *
+ * Get the next collation element in the source string. + *
+ *+ * This iterator iterates over a sequence of collation elements that were + * built from the string. Because there isn't necessarily a one-to-one + * mapping from characters to collation elements, this doesn't mean the + * same thing as "return the collation element [or ordering priority] of + * the next character in the string". + *
+ *+ * This function returns the collation element that the iterator is * currently pointing to and then updates the internal pointer to point to * the next element. previous() updates the pointer first and then * returns the element. This means that when you change direction while * iterating (i.e., call next() and then call previous(), or call * previous() and then call next()), you'll get back the same element - * twice.
- * @return the next collation element + * twice. + * + * @return the next collation element or NULLORDER if the end of the + * iteration has been reached. * @draft 2.2 */ - public synchronized int next() + public int next() { m_isForwards_ = true; if (m_CEBufferSize_ > 0) { @@ -230,24 +286,30 @@ public final class CollationElementIterator } /** - *Get the previous collation element in the string.
- *This iterator iterates over a sequence of collation elements that + *
+ * Get the previous collation element in the source string. + *
+ *+ * This iterator iterates over a sequence of collation elements that * were built from the string. Because there isn't necessarily a * one-to-one mapping from characters to collation elements, this doesn't * mean the same thing as "return the collation element [or ordering - * priority] of the previous character in the string".
- *This function updates the iterator's internal pointer to point to + * priority] of the previous character in the string". + *
+ *+ * This function updates the iterator's internal pointer to point to * the collation element preceding the one it's currently pointing to and * then returns that element, while next() returns the current element and * then updates the pointer. This means that when you change direction * while iterating (i.e., call next() and then call previous(), or call * previous() and then call next()), you'll get back the same element - * twice.
+ * twice. + * * @return the previous collation element, or NULLORDER when the start of - * the iteration has been reached. + * the iteration has been reached. * @draft 2.2 */ - public synchronized int previous() + public int previous() { if (m_source_.getIndex() <= 0 && m_isForwards_) { // if iterator is new or reset, we can immediate perform backwards @@ -317,50 +379,66 @@ public final class CollationElementIterator } /** - * Return the primary strength of a collation element. + * Return the primary order of a collation element ce. + * i.e. the first 16 bits of the argument ce. * @param ce the collation element - * @return the element's primary strength + * @return the element's 16 bits primary order. * @draft 2.2 */ public final static int primaryOrder(int ce) { - return (ce & RuleBasedCollator.CE_PRIMARY_MASK_) >> CE_PRIMARY_SHIFT_; + return (ce & RuleBasedCollator.CE_PRIMARY_MASK_) + >>> RuleBasedCollator.CE_PRIMARY_SHIFT_; } /** - * Return the secondary strength of a collation element. + * Return the secondary order of a collation element ce. + * i.e. the 16th to 27th inclusive set of bits in the argument ce. * @param ce the collation element - * @return the element's secondary strength + * @return the element's 8 bits secondary order * @draft 2.2 */ - public final static short secondaryOrder(int ce) + public final static int secondaryOrder(int ce) { - return (short)((ce & RuleBasedCollator.CE_SECONDARY_MASK_) - >> CE_SECONDARY_SHIFT_); + return (ce & RuleBasedCollator.CE_SECONDARY_MASK_) + >> RuleBasedCollator.CE_SECONDARY_SHIFT_; } /** - * Return the tertiary strength of a collation element. - * @param colelem the collation element - * @return the element's tertiary strength + * Return the tertiary order of a collation element ce. i.e. the last + * 8 bits in the argument ce. 
+ * @param ce the collation element + * @return the element's 8 bits tertiary order * @draft 2.2 */ - public final static short tertiaryOrder(int ce) + public final static int tertiaryOrder(int ce) { - return (short)(ce & RuleBasedCollator.CE_TERTIARY_MASK_); + return ce & RuleBasedCollator.CE_TERTIARY_MASK_; } /** - *Sets the iterator to point to the collation element corresponding to - * the specified character (the parameter is a CHARACTER offset in the - * original string, not an offset into its corresponding sequence of - * collation elements). The value returned by the next call to next() - * will be the collation element corresponding to the specified position - * in the text. If that position is in the middle of a contracting - * character sequence, the result of the next call to next() is the - * collation element for that sequence. This means that getOffset() - * is not guaranteed to return the same value as was passed to a preceding - * call to setOffset().
- * @param offset new character offset into the original text to set. + *+ * Sets the iterator to point to the collation element corresponding to + * the specified character argument offset. The value returned by the next + * call to next() will be the collation element corresponding to the + * characters at argument offset. + *
+ *+ * If argument offset is in the middle of a contracting character sequence, + * the iterator is adjusted to the start of the contracting sequence. This + * means that getOffset() is not guaranteed to return the same value as + * the argument offset. + *
+ *+ * If the decomposition mode is on and argument offset is in the middle of + * a decomposible range of source text, the iterator may not render a + * correct result for + * the next forwards or backwards iteration. User has to ensure that the + * argument offset does not violate the mid of a decomposible range in + * source text. + *
+ * @param offset character offset into the original source string to + * set. Note this argument is not an offset into the corresponding + * sequence of collation elements * @draft 2.2 */ public void setOffset(int offset) @@ -388,7 +466,7 @@ public final class CollationElementIterator } updateInternalState(); int prevoffset = 0; - while (m_source_.getIndex() < offset) { + while (m_source_.getIndex() <= offset) { prevoffset = m_source_.getIndex(); next(); } @@ -399,59 +477,36 @@ public final class CollationElementIterator } /** - *Set a new string over which to iterate.
- *Iteration will start from the start of source.
- * @param source the new source text. + *+ * Sets a new source string for iteration and restart the iteration to + * start from the beginning of the argument source. + *
+ * @param source the new source string for iteration. * @draft 2.2 */ - public synchronized void setText(String source) + public void setText(String source) { m_source_ = new StringCharacterIterator(source); updateInternalState(); } /** - *Set a new string iterator over which to iterate.
- *Iteration will start from the start of source.
- * @param source the new source text. + *+ * Sets a new source string iterator for iteration and restart the + * iteration to start from the beginning of the argument source. + *
+ * @param source the new source string iterator for iteration. * @draft 2.2 */ - public synchronized void setText(CharacterIterator source) + public void setText(CharacterIterator source) { m_source_ = source; - m_source_.setIndex(0); + m_source_.setIndex(m_source_.getBeginIndex()); updateInternalState(); } // public miscellaneous methods ----------------------------------------- - // protected data members ----------------------------------------------- - - /** - * true if current codepoint was Hiragana - */ - protected boolean m_isCodePointHiragana_; - /** - * Position in the original string that starts with a non-FCD sequence - */ - protected int m_FCDStart_; - /** - * This is the CE from CEs buffer that should be returned. - * Initial value is 0. - * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_, - * backwards will end with m_CEBufferOffset_ == 0. - * The next/previous after we reach the end/beginning of the m_CEBuffer_ - * will cause this value to be reset to 0. - */ - protected int m_CEBufferOffset_; - /** - * This is the position to which we have stored processed CEs. - * Initial value is 0. - * The next/previous after we reach the end/beginning of the m_CEBuffer_ - * will cause this value to be reset to 0. - */ - protected int m_CEBufferSize_; - // protected constructors ----------------------------------------------- /** @@ -493,29 +548,95 @@ public final class CollationElementIterator updateInternalState(); } - // protected methods ---------------------------------------------------- + // package private data members ----------------------------------------- + + /** + * true if current codepoint was Hiragana + */ + boolean m_isCodePointHiragana_; + /** + * Position in the original string that starts with a non-FCD sequence + */ + int m_FCDStart_; + /** + * This is the CE from CEs buffer that should be returned. + * Initial value is 0. 
+ * Forwards iteration will end with m_CEBufferOffset_ == m_CEBufferSize_, + * backwards will end with m_CEBufferOffset_ == 0. + * The next/previous after we reach the end/beginning of the m_CEBuffer_ + * will cause this value to be reset to 0. + */ + int m_CEBufferOffset_; + /** + * This is the position to which we have stored processed CEs. + * Initial value is 0. + * The next/previous after we reach the end/beginning of the m_CEBuffer_ + * will cause this value to be reset to 0. + */ + int m_CEBufferSize_; - /** - * Checks if iterator is in the buffer zone - * @return true if iterator is in buffer zone, false otherwise - */ - protected boolean isInBuffer() - { - return m_bufferOffset_ != -1; - } + // package private methods ---------------------------------------------- /** * Sets the collator used. * Internal use, all data members will be reset to the default values * @param collator to set */ - protected void setCollator(RuleBasedCollator collator) + void setCollator(RuleBasedCollator collator) { m_collator_ = collator; updateInternalState(); } - // private data members ------------------------------------------------- + /** + *Sets the iterator to point to the collation element corresponding to + * the specified character (the parameter is a CHARACTER offset in the + * original string, not an offset into its corresponding sequence of + * collation elements). The value returned by the next call to next() + * will be the collation element corresponding to the specified position + * in the text. Unlike the public method setOffset(int), this method does + * not try to readjust the offset to the start of a contracting sequence. + * getOffset() is guaranteed to return the same value as was passed to a + * preceding call to setOffset().
+ * @param offset new character offset into the original text to set. + * @draft 2.2 + */ + void setExactOffset(int offset) + { + m_source_.setIndex(offset); + updateInternalState(); + } + + /** + * Checks if iterator is in the buffer zone + * @return true if iterator is in buffer zone, false otherwise + */ + boolean isInBuffer() + { + return m_bufferOffset_ != -1; + } + + /** + * Determine if a character is a Thai vowel, which sorts after its base + * consonant. + * @param ch character to test + * @return true if ch is a Thai prevowel, false otherwise + */ + static final boolean isThaiPreVowel(char ch) + { + return (ch >= 0xe40 && ch <= 0xe44) || (ch >= 0xec0 && ch <= 0xec4); + } + + /** + * Determine if a character is a Thai base consonant, which sorts before + * its prevowel + * @param ch character to test + * @return true if ch is a Thai base consonant, false otherwise + */ + static final boolean isThaiBaseConsonant(char ch) + { + return ch >= 0xe01 && ch <= 0xe2e; + } // private inner class -------------------------------------------------- @@ -675,8 +796,6 @@ public final class CollationElementIterator private static final int CE_LONG_PRIMARY_TAG_ = 12; private static final int CE_CE_TAGS_COUNT = 13; private static final int CE_BYTE_COMMON_ = 0x05; - private static final int CE_PRIMARY_SHIFT_ = 16; - private static final int CE_SECONDARY_SHIFT_ = 8; // end special ce values and tags --------------------------------------- @@ -773,21 +892,19 @@ public final class CollationElementIterator * Source offsets points to the current processing character. 
* */ - private void normalize() + private void normalize() { - /* synwee todo normalize to 1 before fcd - try { - decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_, - m_collator_.m_decomposition_); - } - catch (ArrayOutOfBoundsException e) { - // increase the size of the buffer - m_buffer_ = new char[m_buffer_.length << 1]; - decompose(m_buffer_, m_source_, m_FCDStart_, m_FCDLimit_, - m_collator_.m_decomposition_); - } - */ - m_bufferOffset_ = 0; + int size = m_FCDLimit_ - m_FCDStart_; + m_buffer_.delete(0, m_buffer_.length()); + m_source_.setIndex(m_FCDStart_); + for (int i = 0; i < size; i ++) { + m_buffer_.append(m_source_.current()); + m_source_.next(); + } + String decomp = Normalizer.decompose(m_buffer_.toString(), false); + m_buffer_.delete(0, m_buffer_.length()); + m_buffer_.append(decomp); + m_bufferOffset_ = 0; } /** @@ -811,24 +928,22 @@ public final class CollationElementIterator { boolean result = true; - // srcP = collationSource->pos-1; - - // Get the trailing combining class of the current character. + // Get the trailing combining class of the current character. // If it's zero, we are OK. m_FCDStart_ = offset; m_source_.setIndex(offset); // trie access - char fcd = 0; // synwee todo: unorm_getFCD16(ch); + char fcd = NormalizerImpl.getFCD16(ch); if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { ch = m_source_.next(); // CharacterIterator.DONE has 0 fcd if (UTF16.isTrailSurrogate(ch)) { - fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch); + fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch); } else { fcd = 0; } } - byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_); + int prevTrailCC = fcd & LAST_BYTE_MASK_; if (prevTrailCC != 0) { // The current char has a non-zero trailing CC. 
Scan forward until @@ -839,16 +954,16 @@ public final class CollationElementIterator break; } // trie access - fcd = 0; // unorm_getFCD16(ch); + fcd = NormalizerImpl.getFCD16(ch); if (fcd != 0 && UTF16.isLeadSurrogate(ch)) { ch = m_source_.next(); if (UTF16.isTrailSurrogate(ch)) { - fcd = 0xFFFF; // unorm_getFCD16FromSurrogatePair(fcd, ch); + fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, ch); } else { fcd = 0; } } - byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_); + int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_; if (leadCC == 0) { // this is a base character, we stop the FCD checks break; @@ -858,12 +973,12 @@ public final class CollationElementIterator result = false; } - prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_); + prevTrailCC = fcd & LAST_BYTE_MASK_; } } + m_FCDLimit_ = m_source_.getIndex(); m_source_.setIndex(m_FCDStart_); m_source_.next(); - m_FCDLimit_ = m_source_.getIndex(); return result; } @@ -885,8 +1000,7 @@ public final class CollationElementIterator } else { // we are in the buffer, buffer offset will never be 0 here - result = m_buffer_.charAt(m_bufferOffset_ ++); - if (result == 0) { + if (m_bufferOffset_ >= m_buffer_.length()) { // Null marked end of buffer, revert to the source string and // loop back to top to try again to get a character. m_source_.setIndex(m_FCDLimit_); @@ -894,10 +1008,10 @@ public final class CollationElementIterator m_buffer_.delete(0, m_buffer_.length()); return nextChar(); } - return result; + return m_buffer_.charAt(m_bufferOffset_ ++); } - if (m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION + if (m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION || m_bufferOffset_ != -1 || m_FCDLimit_ > startoffset // skip the fcd checks || result < FULL_ZERO_COMBINING_CLASS_FAST_LIMIT_ @@ -934,20 +1048,10 @@ public final class CollationElementIterator * the buffer. * Source offsets points to the current processing character. 
*/ - public void normalizeBackwards() + private void normalizeBackwards() { - int start = m_FCDStart_; - int size = 0; - /* synwee todo normalize including fcd - try { - size = decompose(m_buffer_, m_source_, start, m_FCDLimit_); - } - catch (ArrayOutOfBoundsException .) { - m_buffer_ = new char[m_buffer_.length << 1]; - size = decompose(m_buffer_, m_source_, start, m_FCDLimit); - } - */ - m_bufferOffset_ = size - 1; + normalize(); + m_bufferOffset_ = m_buffer_.length(); } /** @@ -972,18 +1076,20 @@ public final class CollationElementIterator { boolean result = true; char fcd = 0; - m_FCDLimit_ = offset; + m_FCDLimit_ = offset + 1; m_source_.setIndex(offset); if (!UTF16.isSurrogate(ch)) { - fcd = 0; // synwee todo unorm_getFCD16(fcdTrieIndex, c); + fcd = NormalizerImpl.getFCD16(ch); } else if (UTF16.isTrailSurrogate(ch) && m_FCDLimit_ > 0) { // note trail surrogate characters gets 0 fcd + char trailch = ch; ch = m_source_.previous(); if (UTF16.isLeadSurrogate(ch)) { - fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2); + fcd = NormalizerImpl.getFCD16(ch); if (fcd != 0) { - fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); + fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, + trailch); } } else { @@ -991,44 +1097,47 @@ public final class CollationElementIterator } } - byte leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_); - if (leadCC != 0) { - // The current char has a non-zero leading combining class. - // Scan backward until we find a char with a trailing cc of zero. - while (true) { - if (m_source_.getIndex() == 0) { - break; - } + int leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_; + // The current char has a non-zero leading combining class. + // Scan backward until we find a char with a trailing cc of zero. 
+ + while (leadCC != 0) { + offset = m_source_.getIndex(); + if (offset == 0) { + break; + } + ch = m_source_.previous(); + if (!UTF16.isSurrogate(ch)) { + fcd = NormalizerImpl.getFCD16(ch); + } + else if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) { + char trail = ch; ch = m_source_.previous(); - if (!UTF16.isSurrogate(ch)) { - fcd = 0; //unorm_getFCD16(fcdTrieIndex, c); - } - else { - if (UTF16.isTrailSurrogate(ch) && m_source_.getIndex() > 0) - { - ch = m_source_.previous(); - if (UTF16.isLeadSurrogate(ch)) { - fcd = 0; // unorm_getFCD16(fcdTrieIndex, c2); - } - if (fcd != 0) { - fcd = 0; // unorm_getFCD16FromSurrogatePair(fcdTrieIndex, fcd, c); - } - } else { - fcd = 0; // unpaired surrogate - } - byte prevTrailCC = (byte)(fcd & LAST_BYTE_MASK_); - if (prevTrailCC == 0) { - break; - } - - if (leadCC < prevTrailCC) { - result = false; - } - leadCC = (byte)(fcd >> SECOND_LAST_BYTE_SHIFT_); - } - } + if (UTF16.isLeadSurrogate(ch)) { + fcd = NormalizerImpl.getFCD16(ch); + } + if (fcd != 0) { + fcd = NormalizerImpl.getFCD16FromSurrogatePair(fcd, trail); + } + } + else { + fcd = 0; // unpaired surrogate + } + int prevTrailCC = fcd & LAST_BYTE_MASK_; + if (leadCC < prevTrailCC) { + result = false; + } + leadCC = fcd >>> SECOND_LAST_BYTE_SHIFT_; } - m_FCDStart_ = m_source_.getIndex(); // character with 0 lead/trail fcd + + // storing character with 0 lead fcd or the 1st accent with a base + // character before it + if (fcd == 0) { + m_FCDStart_ = offset; + } + else { + m_FCDStart_ = m_source_.getIndex(); + } m_source_.setIndex(m_FCDLimit_); return result; } @@ -1062,7 +1171,7 @@ public final class CollationElementIterator char result = m_source_.previous(); int startoffset = m_source_.getIndex(); if (result < LEAD_ZERO_COMBINING_CLASS_FAST_LIMIT_ - || m_collator_.m_decomposition_ == Collator.NO_DECOMPOSITION + || m_collator_.getDecomposition() == Collator.NO_DECOMPOSITION || m_FCDStart_ <= startoffset || m_source_.getIndex() == 0) { return result; } @@ 
-1073,7 +1182,7 @@ public final class CollationElementIterator return result; } // Need a more complete FCD check and possible normalization. - if (!FCDCheckBackwards(ch, startoffset)) { + if (!FCDCheckBackwards(result, startoffset)) { normalizeBackwards(); m_bufferOffset_ --; result = m_buffer_.charAt(m_bufferOffset_); @@ -1085,52 +1194,17 @@ public final class CollationElementIterator * Determines if it is at the start of source iteration * @return true if iterator at the start, false otherwise */ - private boolean isBackwardsStart() + private final boolean isBackwardsStart() { return (m_bufferOffset_ < 0 && m_source_.getIndex() == 0) || (m_bufferOffset_ == 0 && m_FCDStart_ <= 0); } - /** - * Determine if a character is a Thai vowel, which sorts after its base - * consonant. - * @param ch character to test - * @return true if ch is a Thai prevowel, false otherwise - */ - private boolean isThaiPreVowel(char ch) - { - return (ch >= 0xe40 && ch <= 0xe44) || (ch >= 0xec0 && ch <= 0xec4); - } - - /** - * Determine if a character is a Thai base consonant, which sorts before - * its prevowel - * @param ch character to test - * @return true if ch is a Thai base consonant, false otherwise - */ - private boolean isThaiBaseConsonant(char ch) - { - return ch >= 0xe01 && ch <= 0xe2e; - } - - - /** - * Determine if a character is a Jamo - * @param ch character to test - * @return true if ch is a Jamo, false otherwise - */ - private boolean isJamo(char ch) - { - return (ch - 0x1100 <= 0x1112 - 0x1100) - || (ch - 0x1161 <= 0x1175 - 0x1161) - || (ch - 0x11A8 <= 0x11C2 - 0x11A8); - } - /** * Checks if iterator is at the end of its source string. 
* @return true if it is at the end, false otherwise */ - private boolean isEnd() + private final boolean isEnd() { if (m_bufferOffset_ >= 0) { if (m_bufferOffset_ != m_buffer_.length()) { @@ -1155,7 +1229,8 @@ public final class CollationElementIterator * @param trail character * @return next CE for the surrogate characters */ - private int nextSurrogate(RuleBasedCollator collator, int ce, char trail) + private final int nextSurrogate(RuleBasedCollator collator, int ce, + char trail) { if (!UTF16.isTrailSurrogate(trail)) { updateInternalState(m_backup_); @@ -1188,7 +1263,7 @@ public final class CollationElementIterator * @param ch current character * @return next CE for Thai characters */ - private int nextThai(RuleBasedCollator collator, int ce, char ch) + private int nextThai(RuleBasedCollator collator, int ce, char ch) { if (m_bufferOffset_ != -1 // already swapped || isEnd() || !isThaiBaseConsonant(m_source_.current())) { @@ -1430,6 +1505,7 @@ public final class CollationElementIterator * @param collator collator to use * @param ce current ce * @param entrybackup entry backup iterator status + * @return ce of the next contraction */ private int nextContraction(RuleBasedCollator collator, int ce) { @@ -1895,7 +1971,7 @@ public final class CollationElementIterator return collator.m_contractionCE_[entryoffset]; } StringBuffer buffer = new StringBuffer(); - while (collator.isUnsafe(ch)) { + while (collator.isUnsafe(ch) || isThaiBaseConsonant(ch)) { buffer.insert(0, ch); ch = previousChar(); if (isBackwardsStart()) { diff --git a/icu4j/src/com/ibm/icu/text/CollationKey.java b/icu4j/src/com/ibm/icu/text/CollationKey.java index f3b2480b58d..48e74f46b07 100755 --- a/icu4j/src/com/ibm/icu/text/CollationKey.java +++ b/icu4j/src/com/ibm/icu/text/CollationKey.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationKey.java,v $ -* $Date: 2002/05/16 20:04:49 $ -* 
$Revision: 1.5 $ +* $Date: 2002/06/21 23:56:44 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -15,28 +15,42 @@ package com.ibm.icu.text; import java.util.Arrays; /** - *A CollationKey
represents a String
under the
+ *
+ * A CollationKey
represents a String
under the
* rules of a specific Collator
object. Comparing two
* CollationKey
s returns the relative order of the
- * String
s they represent. Using CollationKey
s to
- * compare String
s is generally faster than using
- * Collator.compare
. Thus, when the String
s must be
- * compared multiple times, for example when sorting a list of
- * String
s. It's more efficient to use CollationKey
s.
+ * String
s they represent.
+ *
+ * CollationKey
instances can not be create directly. Rather,
+ * they are generated by calling Collator.getCollationKey(String)
.
+ * Since the rule set of each Collator differs
, the sort orders of
+ * the same string under two unique Collator
may not be the same.
+ * Hence comparing CollationKey
s generated from different
+ * Collator
objects may not give the right results.
+ *
+ * Similar to CollationKey.compareTo(CollationKey)
,
+ * the method RuleBasedCollator.compare(String, String)
compares
+ * two strings and returns the relative order. During the construction
+ * of a CollationKey
object, the entire source string is examined
+ * and processed into a series of bits that are stored in the
+ * CollationKey
object. Bitwise comparison on the bit sequences
+ * are then performed during CollationKey.compareTo(CollationKey)
.
+ * This comparison could incurr expensive startup costs while creating
+ * the CollationKey
object, but once the objects are created,
+ * binary comparisons are fast, and is recommended when the same strings are
+ * to be compared over and over again.
+ * On the other hand Collator.compare(String, String)
examines
+ * and processes the string only until the first characters differing in order,
+ * and is recommend for use if the String
s are to be compared only
+ * once.
+ *
+ * Details of the composition of the bit sequence is located at + * + * user guide. *
- *You can not create CollationKey
s directly. Rather, generate
- * them by calling Collator.getCollationKey(String)
. You can only
- * compare CollationKey
s generated from the same
- * Collator
object.
Generating a CollationKey
for a String
- * involves examining the entire String
and converting it to
- * series of bits that can be compared bitwise. This allows fast comparisons
- * once the keys are generated. The cost of generating keys is recouped in
- * faster comparisons when String
s need to be compared many
- * times. On the other hand, the result of a comparison is often determined by
- * the first couple of characters of each String
.
- * Collator.compare(String, String)
examines only as many characters as it needs
- * which allows it to be faster when doing single comparisons.
The following example shows how CollationKey
s might be used
* to sort a list of String
s.
@@ -63,7 +77,7 @@ import java.util.Arrays; * System.out.println( keys[2].getSourceString() ); * *- * + * * @see Collator * @see RuleBasedCollator * @author Syn Wee Quek @@ -77,7 +91,7 @@ public final class CollationKey implements Comparable // public getters ------------------------------------------------------- /** - * Returns the String that this CollationKey represents. + * Returns the source string that this CollationKey represents. * @return source string that this CollationKey represents * @draft 2.2 */ @@ -87,11 +101,44 @@ public final class CollationKey implements Comparable } /** - *
Duplicates and returns the value of this CollationKey as a sequence - * of big-endian bytes.
- *If two CollationKeys could be legitimately compared, then one could - * compare the byte arrays of each to obtain the same result.
- * @return CollationKey value in a sequence of big-endian byte bytes. + *+ * Duplicates and returns the value of this CollationKey as a sequence + * of big-endian bytes terminated by a null. + *
+ *+ * If two CollationKeys could be legitimately compared, then one could + * compare the byte arrays of each to obtain the same result. + *
+ * byte key1[] = collationkey1.toByteArray(); + * byte key2[] = collationkey2.toByteArray(); + * int i = 0; + * while (key1[i] != 0 && key2[i] != 0) { + * int key = key1[i] & 0xFF; + * int targetkey = key2[i] & 0xFF; + * if (key < targetkey) { + * System.out.println("String 1 is less than string 2"); + * return; + * } + * if (targetkey < key) { + * System.out.println("String 1 is more than string 2"); + * } + * i ++; + * } + * int key = key1[i] & 0xFF; + * int targetkey = key2[i] & 0xFF; + * if (key < targetkey) { + * System.out.println("String 1 is less than string 2"); + * return; + * } + * if (targetkey < key) { + * System.out.println("String 1 is more than string 2"); + * return; + * } + * System.out.println("String 1 is equals to string 2");; + *+ * + * @return CollationKey value in a sequence of big-endian byte bytes + * terminated by a null. * @draft 2.2 */ public byte[] toByteArray() @@ -112,15 +159,22 @@ public final class CollationKey implements Comparable // public other methods ------------------------------------------------- /** - *
Compare this CollationKey to the target CollationKey. The collation - * rules of the Collator object which created these keys are applied.
- *Note: CollationKeys created by different Collators - * can not be compared.
+ *+ * Compare this CollationKey to the argument target CollationKey. + * The collation + * rules of the Collator object which created these keys are applied. + *
+ *+ * Note: Comparison between CollationKeys created by + * different Collators may not return the correct result. See class + * documentation. + *
* @param target target CollationKey * @return an integer value, if value is less than zero this CollationKey * is less than than target, if value is zero if they are equal * and value is greater than zero if this CollationKey is greater * than target. + * @exception NullPointerException thrown when argument is null. * @see Collator#compare(String, String) * @draft 2.2 */ @@ -151,13 +205,21 @@ public final class CollationKey implements Comparable } /** - *Compares this CollationKey with the specified Object.
+ *+ * Compares this CollationKey with the specified Object. + * The collation + * rules of the Collator object which created these objects are applied. + *
+ *+ * See note in compareTo(CollationKey) for warnings of incorrect results + *
* @param obj the Object to be compared. * @return Returns a negative integer, zero, or a positive integer * respectively if this CollationKey is less than, equal to, or * greater than the given Object. - * @exception ClassCastException thrown when the specified Object is not a - * CollationKey. + * @exception ClassCastException thrown when the specified argument is not + * a CollationKey. NullPointerException thrown when argument + * is null. * @see #compareTo(CollationKey) * @draft 2.2 */ @@ -167,22 +229,52 @@ public final class CollationKey implements Comparable } /** - *Compare this CollationKey and the target CollationKey for equality. + *
+ * Compare this CollationKey and the argument target object for equality. + * The collation + * rules of the Collator object which created these objects are applied. *
- *The collation rules of the Collator object which created these keys - * are applied.
- *Note: CollationKeys created by different Collators - * can not be compared.
- * @param target the CollationKey to compare to. + *+ * See note in compareTo(CollationKey) for warnings of incorrect results + *
+ * @param target the object to compare to. * @return true if two objects are equal, false otherwise. + * @see #compareTo(CollationKey) + * @exception ClassCastException thrown when the specified argument is not + * a CollationKey. NullPointerException thrown when argument + * is null. * @draft 2.2 */ public boolean equals(Object target) + { + if (!(target instanceof CollationKey)) { + return false; + } + + return equals((CollationKey)target); + } + + /** + *+ * Compare this CollationKey and the argument target CollationKey for + * equality. + * The collation + * rules of the Collator object which created these objects are applied. + *
+ *+ * See note in compareTo(CollationKey) for warnings of incorrect results + *
+ * @param target the CollationKey to compare to. + * @return true if two objects are equal, false otherwise. + * @exception NullPointerException thrown when argument is null. + * @draft 2.2 + */ + public boolean equals(CollationKey target) { if (this == target) { return true; } - if (target == null || !(target instanceof CollationKey)) { + if (target == null) { return false; } CollationKey other = (CollationKey)target; @@ -200,12 +292,13 @@ public final class CollationKey implements Comparable } /** - *Creates a hash code for this CollationKey. The hash value is - * calculated on the key itself, not the String from which the key was - * created. Thus if x and y are CollationKeys, then - * x.hashCode(x) == y.hashCode() if x.equals(y) is true. This allows - * language-sensitive comparison in a hash table.
- *See the CollatinKey class description for an example.
+ *+ * Creates a hash code for this CollationKey. The hash value is calculated + * on the key itself, not the String from which the key was created. Thus + * if x and y are CollationKeys, then x.hashCode(x) == y.hashCode() + * if x.equals(y) is true. This allows language-sensitive comparison in a + * hash table. + *
* @return the hash value. * @draft 2.2 */ diff --git a/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java b/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java new file mode 100644 index 00000000000..8276a24f357 --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java @@ -0,0 +1,3487 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2002, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java,v $ +* $Date: 2002/06/21 23:57:55 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ +package com.ibm.icu.text; + +import java.io.InputStream; +import java.io.BufferedInputStream; +import java.text.ParseException; +import java.util.Hashtable; +import java.util.Vector; +import java.util.Arrays; +import java.util.Enumeration; + +import com.ibm.icu.dev.test.lang.UCharacterCaseTest; +import com.ibm.icu.impl.TrieBuilder; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.impl.NormalizerImpl; +import com.ibm.icu.util.RangeValueIterator; + +/** +* Class for building a collator from a list of collation rules. 
+* This class is uses CollationRuleParser +* @author Syn Wee Quek +* @since release 2.2, June 11 2002 +* @draft 2.2 +*/ +class CollationParsedRuleBuilder +{ + // package private constructors ------------------------------------------ + + /** + * Constructor + * @param rules collation rules + * @exception ParseException thrown when argument rules have an invalid + * syntax + */ + CollationParsedRuleBuilder(String rules) throws ParseException + { + m_parser_ = new CollationRuleParser(rules); + m_utilColEIter_ = RuleBasedCollator.UCA_.getCollationElementIterator( + ""); + } + + // package private inner classes ----------------------------------------- + + /** + * Inverse UCA wrapper + */ + static class InverseUCA + { + // package private constructor --------------------------------------- + + InverseUCA() + { + } + + // package private data member --------------------------------------- + + /** + * Array list of characters + */ + int m_table_[]; + /** + * Array list of continuation characters + */ + char m_continuations_[]; + + // package private method -------------------------------------------- + + /** + * Returns the previous inverse ces of the argument ces + * @param ce ce to test + * @param contce continuation ce to test + * @param strength collation strength + * @param result an array to store the return results of inverse ce, + * previous inverse ce and previous inverse continuation ce + */ + final void getInversePrevCE(int ce, int contce, int strength, + int result[]) + { + int ice = findInverseCE(ce, contce); + + if (ice < 0) { + result[0] = -1; + result[1] = CollationElementIterator.NULLORDER; + return; + } + + ce &= STRENGTH_MASK_[strength]; + contce &= STRENGTH_MASK_[strength]; + + result[1] = ce; + result[2] = contce; + + while ((result[1] & STRENGTH_MASK_[strength]) == ce + && (result[2] & STRENGTH_MASK_[strength])== contce + && ice > 0) { + // this condition should prevent falling off the edge of the + // world + // here, we end up in a singularity - 
zero + result[1] = m_table_[3 * (-- ice)]; + result[2] = m_table_[3 * ice + 1]; + } + } + + /** + * Finding the inverse CE of the argument CEs + * @param ce CE to be tested + * @param contce continuation CE + * @return inverse CE + */ + int findInverseCE(int ce, int contce) + { + int bottom = 0; + int top = m_table_.length / 3; + int result = 0; + + while (bottom < top - 1) { + result = (top + bottom) >> 1; + int first = m_table_[3 * result]; + int second = m_table_[3 * result + 1]; + if (first > ce) { + top = result; + } + else if (first < ce) { + bottom = result; + } + else { + if (second > contce) { + top = result; + } + else if (second < contce) { + bottom = result; + } + else { + break; + } + } + } + + return result; + } + + /** + * Getting gap offsets in the inverse UCA + * @param listheader parsed token lists + * @exception Exception thrown when error occurs while finding the + * collation gaps + */ + void getInverseGapPositions(CollationRuleParser.TokenListHeader + listheader) + throws Exception + { + // reset all the gaps + CollationRuleParser.Token token = listheader.m_first_; + int tokenstrength = token.m_strength_; + + for (int i = 0; i < 3; i ++) { + listheader.m_gapsHi_[3 * i] = 0; + listheader.m_gapsHi_[3 * i + 1] = 0; + listheader.m_gapsHi_[3 * i + 2] = 0; + listheader.m_gapsLo_[3 * i] = 0; + listheader.m_gapsLo_[3 * i + 1] = 0; + listheader.m_gapsLo_[3 * i + 2] = 0; + listheader.m_numStr_[i] = 0; + listheader.m_fStrToken_[i] = null; + listheader.m_lStrToken_[i] = null; + listheader.m_pos_[i] = -1; + } + + if (listheader.m_baseCE_ >= CE_PRIMARY_IMPLICIT_MIN_ + && listheader.m_baseCE_ < CE_PRIMARY_IMPLICIT_MAX_) { + // implicits - + listheader.m_pos_[0] = 0; + int t1 = listheader.m_baseCE_; + int t2 = listheader.m_baseContCE_; + listheader.m_gapsLo_[0] = mergeCE(t1, t2, + Collator.PRIMARY); + listheader.m_gapsLo_[1] = mergeCE(t1, t2, + Collator.SECONDARY); + listheader.m_gapsLo_[2] = mergeCE(t1, t2, + Collator.TERTIARY); + if (listheader.m_baseCE_ < 
0xEF000000) { + // first implicits have three byte primaries, with a gap of + // one so we esentially need to add 2 to the top byte in + // listheader.m_baseContCE_ + t2 += 0x02000000; + } + else { + // second implicits have four byte primaries, with a gap of + // IMPLICIT_LAST2_MULTIPLIER_ + // Now, this guy is not really accessible here, so until we + // find a better way to pass it around, assume that the gap is 1 + t2 += 0x00020000; + } + listheader.m_gapsHi_[0] = mergeCE(t1, t2, + Collator.PRIMARY); + listheader.m_gapsHi_[1] = mergeCE(t1, t2, + Collator.SECONDARY); + listheader.m_gapsHi_[2] = mergeCE(t1, t2, + Collator.TERTIARY); + } + else if (listheader.m_indirect_ == true + && listheader.m_nextCE_ != 0) { + listheader.m_pos_[0] = 0; + int t1 = listheader.m_baseCE_; + int t2 = listheader.m_baseContCE_; + listheader.m_gapsLo_[0] = mergeCE(t1, t2, + Collator.PRIMARY); + listheader.m_gapsLo_[1] = mergeCE(t1, t2, + Collator.SECONDARY); + listheader.m_gapsLo_[2] = mergeCE(t1, t2, + Collator.TERTIARY); + t1 = listheader.m_nextCE_; + t2 = listheader.m_nextContCE_; + listheader.m_gapsHi_[0] = mergeCE(t1, t2, + Collator.PRIMARY); + listheader.m_gapsHi_[1] = mergeCE(t1, t2, + Collator.SECONDARY); + listheader.m_gapsHi_[2] = mergeCE(t1, t2, + Collator.TERTIARY); + } + else { + while (true) { + if (tokenstrength < CE_STRENGTH_LIMIT_) { + listheader.m_pos_[tokenstrength] + = getInverseNext(listheader, + tokenstrength); + if (listheader.m_pos_[tokenstrength] >= 0) { + listheader.m_fStrToken_[tokenstrength] = token; + } + else { + // The CE must be implicit, since it's not in the + // table + // Error + throw new Exception("Internal program error"); + } + } + + while (token != null && token.m_strength_ >= tokenstrength) + { + if (tokenstrength < CE_STRENGTH_LIMIT_) { + listheader.m_lStrToken_[tokenstrength] = token; + } + token = token.m_next_; + } + if (tokenstrength < CE_STRENGTH_LIMIT_ - 1) { + // check if previous interval is the same and merge the + // intervals if it 
is so + if (listheader.m_pos_[tokenstrength] + == listheader.m_pos_[tokenstrength + 1]) { + listheader.m_fStrToken_[tokenstrength] + = listheader.m_fStrToken_[tokenstrength + + 1]; + listheader.m_fStrToken_[tokenstrength + 1] = null; + listheader.m_lStrToken_[tokenstrength + 1] = null; + listheader.m_pos_[tokenstrength + 1] = -1; + } + } + if (token != null) { + tokenstrength = token.m_strength_; + } + else { + break; + } + } + for (int st = 0; st < 3; st ++) { + int pos = listheader.m_pos_[st]; + if (pos >= 0) { + int t1 = m_table_[3 * pos]; + int t2 = m_table_[3 * pos + 1]; + listheader.m_gapsHi_[3 * st] = mergeCE(t1, t2, + Collator.PRIMARY); + listheader.m_gapsHi_[3 * st + 1] = mergeCE(t1, t2, + Collator.SECONDARY); + listheader.m_gapsHi_[3 * st + 2] = (t1 & 0x3f) << 24 + | (t2 & 0x3f) << 16; + pos --; + t1 = m_table_[3 * pos]; + t2 = m_table_[3 * pos + 1]; + listheader.m_gapsLo_[3 * st] = mergeCE(t1, t2, + Collator.PRIMARY); + listheader.m_gapsLo_[3 * st + 1] = mergeCE(t1, t2, + Collator.SECONDARY); + listheader.m_gapsLo_[3 * st + 2] = (t1 & 0x3f) << 24 + | (t2 & 0x3f) << 16; + } + } + } + } + + /** + * Gets the next CE in the inverse table + * @param listheader token list header + * @param strength collation strength + * @return next ce + */ + private final int getInverseNext(CollationRuleParser.TokenListHeader + listheader, + int strength) + { + int ce = listheader.m_baseCE_; + int secondce = listheader.m_baseContCE_; + int result = findInverseCE(ce, secondce); + + if (result < 0) { + return -1; + } + + ce &= STRENGTH_MASK_[strength]; + secondce &= STRENGTH_MASK_[strength]; + + int nextce = ce; + int nextcontce = secondce; + + while((nextce & STRENGTH_MASK_[strength]) == ce + && (nextcontce & STRENGTH_MASK_[strength]) == secondce) { + nextce = m_table_[3 * (++ result)]; + nextcontce = m_table_[3 * result + 1]; + } + + listheader.m_nextCE_ = nextce; + listheader.m_nextContCE_ = nextcontce; + + return result; + } + } + + // package private data members 
------------------------------------------ + + /** + * Inverse UCA, instantiate only when required + */ + static final InverseUCA INVERSE_UCA_; + + /** + * Initializing the inverse UCA + */ + static { + try + { + String invdat = "/com/ibm/icu/impl/data/invuca.dat"; + InputStream i = invdat.getClass().getResourceAsStream(invdat); + BufferedInputStream b = new BufferedInputStream(i, 110000); + INVERSE_UCA_ = CollatorReader.readInverseUCA(b); + b.close(); + i.close(); + } + catch (Exception e) + { + e.printStackTrace(); + throw new RuntimeException(e.getMessage()); + } + } + + // package private methods ----------------------------------------------- + + /** + * Parse and sets the collation rules in the argument collator + * @param collator to set + * @exception Exception thrown when internal program error occurs + */ + void setRules(RuleBasedCollator collator) throws Exception + { + if (m_parser_.m_resultLength_ > 0) { + // we have a set of rules, let's make something of it + assembleTailoringTable(collator); + } + else { // no rules, but no error either must be only options + // We will init the collator from UCA + collator.setWithUCATables(); + // And set only the options + m_parser_.setDefaultOptionsInCollator(collator); + } + } + + /** + * 2. Eliminate the negative lists by doing the following for each + * non-null negative list: + * o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, + * create new ListHeader X + * o reverse the list, add to the end of X's positive list. Reset the + * strength of the first item you add, based on the stronger strength + * levels of the two lists. + * + * 3. For each ListHeader with a non-null positive list: + * o Find all character strings with CEs between the baseCE and the + * next/previous CE, at the strength of the first token. Add these to the + * tailoring. + * ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the + * tailoring has & x < z... + * ? 
Then we change the tailoring to & x <<< X << x' <<< X' < z ... + * + * It is possible that this part should be done even while constructing list + * The problem is that it is unknown what is going to be the strongest + * weight. + * So we might as well do it here + * o Allocate CEs for each token in the list, based on the total number N + * of the largest level difference, and the gap G between baseCE and nextCE + * at that level. The relation * between the last item and nextCE is the + * same as the strongest strength. + * o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) + * ? There are 3 primary items: a, d, e. Fit them into the primary gap. + * Then fit b and c into the secondary gap between a and d, then fit q + * into the tertiary gap between b and c. + * o Example: baseCE << b <<< q << c * nextCE(X,2) + * ? There are 2 secondary items: b, c. Fit them into the secondary gap. + * Then fit q into the tertiary gap between b and c. + * o When incrementing primary values, we will not cross high byte + * boundaries except where there is only a single-byte primary. That is + * to ensure that the script reordering will continue to work. 
+ * @param collator the rule based collator to update + * @exception Exception thrown when internal program error occurs + */ + void assembleTailoringTable(RuleBasedCollator collator) throws Exception + { + + for (int i = 0; i < m_parser_.m_resultLength_; i ++) { + // now we need to generate the CEs + // We stuff the initial value in the buffers, and increase the + // appropriate buffer according to strength */ + initBuffers(m_parser_.m_listHeader_[i]); + } + + if (m_parser_.m_variableTop_ != null) { + // stuff the variable top value + m_parser_.m_options_.m_variableTopValue_ + = m_parser_.m_variableTop_.m_CE_[0] >>> 16; + // remove it from the list + if (m_parser_.m_variableTop_.m_listHeader_.m_first_ + == m_parser_.m_variableTop_) { // first in list + m_parser_.m_variableTop_.m_listHeader_.m_first_ + = m_parser_.m_variableTop_.m_next_; + } + if (m_parser_.m_variableTop_.m_listHeader_.m_last_ + == m_parser_.m_variableTop_) { + // first in list + m_parser_.m_variableTop_.m_listHeader_.m_last_ + = m_parser_.m_variableTop_.m_previous_; + } + if (m_parser_.m_variableTop_.m_next_ != null) { + m_parser_.m_variableTop_.m_next_.m_previous_ + = m_parser_.m_variableTop_.m_previous_; + } + if (m_parser_.m_variableTop_.m_previous_ != null) { + m_parser_.m_variableTop_.m_previous_.m_next_ + = m_parser_.m_variableTop_.m_next_; + } + } + + + BuildTable t = new BuildTable(m_parser_); + + // After this, we have assigned CE values to all regular CEs now we + // will go through list once more and resolve expansions, make + // UCAElements structs and add them to table + for (int i = 0; i < m_parser_.m_resultLength_; i ++) { + // now we need to generate the CEs + // We stuff the initial value in the buffers, and increase the + // appropriate buffer according to strength */ + createElements(t, m_parser_.m_listHeader_[i]); + } + + Elements el = new Elements(); + el.m_isThai_ = false; + el.m_prefixChars_ = null; + int ce[] = new int[256]; + StringBuffer str = new StringBuffer(); + + // 
add latin-1 stuff + for (char u = 0; u < 0x100; u ++) { + // if ((CE = ucmpe32_get(t.m_mapping, u)) == UCOL_NOT_FOUND + int CE = TrieBuilder.get32(t.m_mapping_, (int)u); + if (CE == CE_NOT_FOUND_ + // this test is for contractions that are missing the starting + // element. Looks like latin-1 should be done before + // assembling the table, even if it results in more false + // closure elements + || (isContractionTableElement(CE) + && getCE(t.m_contractions_, CE, 0) == CE_NOT_FOUND_)) { + str.delete(0, str.length()); + str.append(u); + el.m_uchars_ = str.toString(); + el.m_cPoints_ = el.m_uchars_; + el.m_prefix_ = 0; + int ceoffset = 0; + m_utilColEIter_.setText(el.m_uchars_); + while (CE != CollationElementIterator.NULLORDER) { + CE = m_utilColEIter_.next(); + if (CE != CollationElementIterator.NULLORDER) { + ce[ceoffset ++] = CE; + } + } + el.m_CEs_ = new int[ceoffset]; + System.arraycopy(ce, 0, el.m_CEs_, 0, ceoffset); + addAnElement(t, el); + } + } + + // copy contractions from the UCA - this is felt mostly for cyrillic + char conts[] = RuleBasedCollator.UCA_.m_UCAContraction_; + int offset = 0; + while (conts[offset] != 0) { + // tailoredCE = ucmpe32_get(t.m_mapping, *conts); + int tailoredCE = TrieBuilder.get32(t.m_mapping_, conts[offset]); + if (tailoredCE != CE_NOT_FOUND_) { + boolean needToAdd = true; + if (isContractionTableElement(tailoredCE)) { + if (isTailored(t.m_contractions_, tailoredCE, + conts, offset + 1) == true) { + needToAdd = false; + } + } + if (needToAdd == true) { + // we need to add if this contraction is not tailored. 
+ el.m_prefix_ = 0; + el.m_prefixChars_ = null; + el.m_cPoints_ = el.m_uchars_; + str.delete(0, str.length()); + str.append(conts[offset]); + str.append(conts[offset + 1]); + if (conts[offset + 2] != 0) { + str.append(conts[offset + 2]); + } + el.m_uchars_ = str.toString(); + int ceoffset = 0; + m_utilColEIter_.setText(el.m_uchars_); + while (true) { + int CE = m_utilColEIter_.next(); + if (CE != CollationElementIterator.NULLORDER) { + ce[ceoffset ++] = CE; + } + else { + break; + } + } + el.m_CEs_ = new int[ceoffset]; + System.arraycopy(ce, 0, el.m_CEs_, 0, ceoffset); + addAnElement(t, el); + } + } + offset += 3; + } + + BuildTable temp = new BuildTable(t); + assembleTable(temp, temp.m_collator_); + // produce canonical closure + CollationElementIterator coleiter + = temp.m_collator_.getCollationElementIterator(""); + RangeValueIterator typeiter = UCharacter.getTypeIterator(); + RangeValueIterator.Element element = new RangeValueIterator.Element(); + while (typeiter.next(element)) { + _enumCategoryRangeClosureCategory(t, temp.m_collator_, coleiter, + element.start, element.limit, + element.value); + } + // still need to produce compatibility closure + assembleTable(t, collator); + } + + // private inner classes ------------------------------------------------- + + private static class CEGenerator + { + // package private data members -------------------------------------- + + WeightRange m_ranges_[]; + int m_rangesLength_; + int m_byteSize_; + int m_start_; + int m_limit_; + int m_maxCount_; + int m_count_; + int m_current_; + int m_fLow_; // forbidden Low + int m_fHigh_; // forbidden High + + // package private constructor --------------------------------------- + + CEGenerator() + { + m_ranges_ = new WeightRange[7]; + for (int i = 6; i >= 0; i --) { + m_ranges_[i] = new WeightRange(); + } + } + }; + + private static class WeightRange implements Comparable + { + // public methods ---------------------------------------------------- + + /** + * Compares this 
object with target + * @param target object to compare with + * @return 0 if equals, 1 if this is > target, -1 otherwise + */ + public int compareTo(Object target) + { + if (this == target) { + return 0; + } + int tstart = ((WeightRange)target).m_start_; + if (m_start_ == tstart) { + return 0; + } + if (m_start_ > tstart) { + return 1; + } + return -1; + } + + // package private data members -------------------------------------- + + int m_start_; + int m_end_; + int m_length_; + int m_count_; + int m_length2_; + int m_count2_; + + // package private constructor --------------------------------------- + + WeightRange() + { + m_start_ = 0; + m_end_ = 0; + m_length_ = 0; + m_count_ = 0; + m_length2_ = 0; + m_count2_ = 0; + } + }; + + private static class MaxJamoExpansionTable + { + // package private data members -------------------------------------- + + Vector m_endExpansionCE_; + // vector of booleans + Vector m_isV_; + byte m_maxLSize_; + byte m_maxVSize_; + byte m_maxTSize_; + + // package private constructor --------------------------------------- + + MaxJamoExpansionTable() + { + m_endExpansionCE_ = new Vector(); + m_isV_ = new Vector(); + m_endExpansionCE_.add(new Integer(0)); + m_isV_.add(new Integer(0)); + m_maxLSize_ = 1; + m_maxVSize_ = 1; + m_maxTSize_ = 1; + } + + MaxJamoExpansionTable(MaxJamoExpansionTable table) + { + m_endExpansionCE_ = (Vector)table.m_endExpansionCE_.clone(); + m_isV_ = (Vector)table.m_isV_.clone(); + m_maxLSize_ = table.m_maxLSize_; + m_maxVSize_ = table.m_maxVSize_; + m_maxTSize_ = table.m_maxTSize_; + } + }; + + private static class MaxExpansionTable + { + // package private constructor -------------------------------------- + + MaxExpansionTable() + { + m_endExpansionCE_ = new Vector(); + m_expansionCESize_ = new Vector(); + m_endExpansionCE_.add(new Integer(0)); + m_expansionCESize_.add(new Byte((byte)0)); + } + + MaxExpansionTable(MaxExpansionTable table) + { + m_endExpansionCE_ = (Vector)table.m_endExpansionCE_.clone(); + 
m_expansionCESize_ = (Vector)table.m_expansionCESize_.clone(); + } + + // package private data member -------------------------------------- + + Vector m_endExpansionCE_; + Vector m_expansionCESize_; + }; + + private static class BasicContractionTable + { + // package private constructors ------------------------------------- + + BasicContractionTable() + { + m_CEs_ = new Vector(); + m_codePoints_ = new StringBuffer(); + } + + // package private data members ------------------------------------- + + StringBuffer m_codePoints_; + Vector m_CEs_; + }; + + private static class ContractionTable + { + // package private constructor -------------------------------------- + + /** + * Builds a contraction table + * @param buildtable + */ + ContractionTable(TrieBuilder.BuildTable mapping) + { + m_mapping_ = mapping; + m_elements_ = new Vector(); + m_CEs_ = new Vector(); + m_codePoints_ = new StringBuffer(); + m_offsets_ = new Vector(); + m_currentTag_ = CE_NOT_FOUND_TAG_; + } + + /** + * Copies a contraction table. + * Not all data will be copied into their own object. 
+ * @param table + */ + ContractionTable(ContractionTable table) + { + m_mapping_ = table.m_mapping_; + m_elements_ = (Vector)table.m_elements_.clone(); + m_codePoints_ = new StringBuffer(table.m_codePoints_.toString()); + m_CEs_ = (Vector)table.m_CEs_.clone(); + m_offsets_ = (Vector)table.m_offsets_.clone(); + m_currentTag_ = table.m_currentTag_; + } + + // package private data members ------------------------------------ + + /** + * Vector of BasicContractionTable + */ + Vector m_elements_; + TrieBuilder.BuildTable m_mapping_; + StringBuffer m_codePoints_; + Vector m_CEs_; + Vector m_offsets_; + int m_currentTag_; + }; + + private static class BuildTable + { + // package private constructor -------------------------------------- + + /** + * Returns a table + * @return build table + */ + BuildTable(CollationRuleParser parser) + { + m_collator_ = new RuleBasedCollator(); + m_collator_.setWithUCAData(); + MaxExpansionTable maxet = new MaxExpansionTable(); + MaxJamoExpansionTable maxjet = new MaxJamoExpansionTable(); + m_options_ = parser.m_options_; + m_expansions_ = new Vector(); + // Do your own mallocs for the structure, array and have linear + // Latin 1 + m_mapping_ = TrieBuilder.open(null, null, 0x100000, + RuleBasedCollator.CE_SPECIAL_FLAG_ + | (CE_NOT_FOUND_TAG_ << 24), + true); + m_prefixLookup_ = new Hashtable(); + // uhash_open(prefixLookupHash, prefixLookupComp); + m_contractions_ = new ContractionTable(m_mapping_); + // copy UCA's maxexpansion and merge as we go along + m_maxExpansions_ = maxet; + // adding an extra initial value for easier manipulation + for (int i = 0; + i < RuleBasedCollator.UCA_.m_expansionEndCE_.length; i ++) { + maxet.m_endExpansionCE_.add(new Integer( + RuleBasedCollator.UCA_.m_expansionEndCE_[i])); + maxet.m_expansionCESize_.add(new Byte( + RuleBasedCollator.UCA_.m_expansionEndCEMaxSize_[i])); + } + m_maxJamoExpansions_ = maxjet; + + m_unsafeCP_ = new byte[UNSAFECP_TABLE_SIZE_]; + m_contrEndCP_ = new byte[UNSAFECP_TABLE_SIZE_]; 
+ Arrays.fill(m_unsafeCP_, (byte)0); + Arrays.fill(m_contrEndCP_, (byte)0); + } + + /** + * Duplicating a BuildTable. + * Not all data will be duplicated into their own object. + * @param table to clone + */ + BuildTable(BuildTable table) + { + m_collator_ = table.m_collator_; + m_mapping_ = new TrieBuilder.BuildTable(table.m_mapping_); + m_expansions_ = (Vector)table.m_expansions_.clone(); + m_contractions_ = new ContractionTable(table.m_contractions_); + m_contractions_.m_mapping_ = m_mapping_; + m_options_ = table.m_options_; + m_maxExpansions_ = new MaxExpansionTable(table.m_maxExpansions_); + m_maxJamoExpansions_ + = new MaxJamoExpansionTable(table.m_maxJamoExpansions_); + m_unsafeCP_ = new byte[table.m_unsafeCP_.length]; + System.arraycopy(table.m_unsafeCP_, 0, m_unsafeCP_, 0, + m_unsafeCP_.length); + m_contrEndCP_ = new byte[table.m_contrEndCP_.length]; + System.arraycopy(table.m_contrEndCP_, 0, m_contrEndCP_, 0, + m_contrEndCP_.length); + } + + // package private data members ------------------------------------- + + RuleBasedCollator m_collator_; + TrieBuilder.BuildTable m_mapping_; + Vector m_expansions_; + ContractionTable m_contractions_; + // UCATableHeader image; + CollationRuleParser.OptionSet m_options_; + MaxExpansionTable m_maxExpansions_; + MaxJamoExpansionTable m_maxJamoExpansions_; + byte m_unsafeCP_[]; + byte m_contrEndCP_[]; + Hashtable m_prefixLookup_; + }; + + private static class Elements + { + // package private data members ------------------------------------- + + String m_prefixChars_; + int m_prefix_; + String m_uchars_; + /** + * Working string + */ + String m_cPoints_; + /** + * Offset to the working string + */ + int m_cPointsOffset_; + /** + * These are collation elements - there could be more than one - in + * case of expansion + */ + int m_CEs_[]; + /** + * This is the value element maps in original table + */ + int m_mapCE_; + int m_sizePrim_[]; + int m_sizeSec_[]; + int m_sizeTer_[]; + boolean m_variableTop_; + boolean 
m_caseBit_; + boolean m_isThai_; + + // package private constructors ------------------------------------- + + /** + * Package private constructor + */ + Elements() + { + m_sizePrim_ = new int[128]; + m_sizeSec_ = new int[128]; + m_sizeTer_ = new int[128]; + } + + /** + * Package private constructor + */ + Elements(Elements element) + { + m_prefixChars_ = element.m_prefixChars_; + m_prefix_ = element.m_prefix_; + m_uchars_ = element.m_uchars_; + m_cPoints_ = element.m_cPoints_; + m_cPointsOffset_ = element.m_cPointsOffset_; + m_CEs_ = element.m_CEs_; + m_mapCE_ = element.m_mapCE_; + m_sizePrim_ = element.m_sizePrim_; + m_sizeSec_ = element.m_sizeSec_; + m_sizeTer_ = element.m_sizeTer_; + m_variableTop_ = element.m_variableTop_; + m_caseBit_ = element.m_caseBit_; + m_isThai_ = element.m_isThai_; + } + + // package private methods ------------------------------------------- + + /** + * Hashcode calculation for token + * @return the hashcode + */ + public int hashCode() + { + String str = m_cPoints_.substring(m_cPointsOffset_); + return str.hashCode(); + } + + /** + * Equals calculation + * @param target object to compare + * @return true if target is the same as this object + */ + public boolean equals(Object target) + { + if (target == this) { + return true; + } + if (target instanceof Elements) { + Elements t = (Elements)target; + int size = m_cPoints_.length() - m_cPointsOffset_; + if (size == t.m_cPoints_.length() - t.m_cPointsOffset_) { + return t.m_cPoints_.regionMatches(t.m_cPointsOffset_, + m_cPoints_, + m_cPointsOffset_, size); + } + } + return false; + } + }; + + // private data member --------------------------------------------------- + + /** + * Maximum strength used in CE building + */ + private static final int CE_BASIC_STRENGTH_LIMIT_ = 3; + /** + * Maximum collation strength + */ + private static final int CE_STRENGTH_LIMIT_ = 16; + /** + * Implicit ce minimum + */ + private static final int CE_PRIMARY_IMPLICIT_MIN_ = 0xE8000000; + private static 
final int CE_PRIMARY_IMPLICIT_MAX_ = 0xF0000000; + /** + * Strength mask array, used in inverse UCA + */ + private static final int STRENGTH_MASK_[] = {0xFFFF0000, 0xFFFFFF00, + 0xFFFFFFFF}; + /** + * CE tag for not found + */ + private static final int CE_NOT_FOUND_ = 0xF0000000; + /** + * CE tag for not found + */ + private static final int CE_NOT_FOUND_TAG_ = 0; + /** + * This code point results in an expansion + */ + private static final int CE_EXPANSION_TAG_ = 1; + /** + * Start of a contraction + */ + private static final int CE_CONTRACTION_TAG_ = 2; + /** + * Thai character - do the reordering + */ + private static final int CE_THAI_TAG_ = 3; + /** + * Charset processing, not yet implemented + */ + private static final int CE_CHARSET_TAG_ = 4; + /** + * Lead surrogate that is tailored and doesn't start a contraction + */ + private static final int CE_SURROGATE_TAG_ = 5; + /** + * AC00-D7AF + */ + private static final int CE_HANGUL_SYLLABLE_TAG_ = 6; + /** + * D800-DBFF + */ + private static final int CE_LEAD_SURROGATE_TAG_ = 7; + /** + * DC00-DFFF + */ + private static final int CE_TRAIL_SURROGATE_TAG_ = 8; + /** + * 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D + */ + private static final int CE_CJK_IMPLICIT_TAG_ = 9; + private static final int CE_IMPLICIT_TAG_ = 10; + private static final int CE_SPEC_PROC_TAG_ = 11; + /** + * This is a three byte primary with starting secondaries and tertiaries. + * It fits in a single 32 bit CE and is used instead of expansion to save + * space without affecting the performance (hopefully) + */ + private static final int CE_LONG_PRIMARY_TAG_ = 12; + /** + * Unsafe UChar hash table table size. Size is 32 bytes for 1 bit for each + * latin 1 char + some power of two for hashing the rest of the chars. + * Size in bytes + */ + private static final int UNSAFECP_TABLE_SIZE_ = 1056; + /** + * Mask value down to "some power of two" -1. Number of bits, not num of + * bytes. 
+ */ + private static final int UNSAFECP_TABLE_MASK_ = 0x1fff; + /** + * Case values + */ + private static final int UPPER_CASE_ = 0x80; + private static final int MIXED_CASE_ = 0x40; + private static final int LOWER_CASE_ = 0x00; + /** + * Initial table size + */ + private static final int INIT_TABLE_SIZE_ = 1028; + /** + * Header size, copied from ICU4C, to be changed when that value changes + */ + private static final int HEADER_SIZE_ = 0xC4; + /** + * Contraction table new element indicator + */ + private static final int CONTRACTION_TABLE_NEW_ELEMENT_ = 0xFFFFFF; + /** + * Parser for the rules + */ + private CollationRuleParser m_parser_; + /** + * Utility UCA collation element iterator + */ + private CollationElementIterator m_utilColEIter_; + + // private methods ------------------------------------------------------- + + /** + * @param listheader parsed rule tokens + * @exception Exception thrown when internal error occurs + */ + private void initBuffers(CollationRuleParser.TokenListHeader listheader) + throws Exception + { + CEGenerator gens[] = {new CEGenerator(), new CEGenerator(), + new CEGenerator()}; + int ceparts[] = new int[CE_BASIC_STRENGTH_LIMIT_]; + CollationRuleParser.Token token = listheader.m_last_; + int t[] = new int[CE_STRENGTH_LIMIT_]; + Arrays.fill(t, 0, CE_STRENGTH_LIMIT_, 0); + + token.m_toInsert_ = 1; + t[token.m_strength_] = 1; + + while (token.m_previous_ != null) { + if (token.m_previous_.m_strength_ < token.m_strength_) { + // going up + t[token.m_strength_] = 0; + t[token.m_previous_.m_strength_] ++; + } + else if (token.m_previous_.m_strength_ > token.m_strength_) { + // going down + t[token.m_previous_.m_strength_] = 1; + } + else { + t[token.m_strength_] ++; + } + token = token.m_previous_; + token.m_toInsert_ = t[token.m_strength_]; + } + + token.m_toInsert_ = t[token.m_strength_]; + INVERSE_UCA_.getInverseGapPositions(listheader); + + token = listheader.m_first_; + int fstrength = Collator.IDENTICAL; + int initstrength = 
Collator.IDENTICAL; + + ceparts[Collator.PRIMARY] = mergeCE(listheader.m_baseCE_, + listheader.m_baseContCE_, + Collator.PRIMARY); + ceparts[Collator.SECONDARY] = mergeCE(listheader.m_baseCE_, + listheader.m_baseContCE_, + Collator.SECONDARY); + ceparts[Collator.TERTIARY] = mergeCE(listheader.m_baseCE_, + listheader.m_baseContCE_, + Collator.TERTIARY); + while (token != null) { + fstrength = token.m_strength_; + if (fstrength < initstrength) { + initstrength = fstrength; + if (listheader.m_pos_[fstrength] == -1) { + while (listheader.m_pos_[fstrength] == -1 && fstrength > 0) + { + fstrength--; + } + if (listheader.m_pos_[fstrength] == -1) { + throw new Exception("Internal program error"); + } + } + if (initstrength == Collator.TERTIARY) { + // starting with tertiary + ceparts[Collator.PRIMARY] + = listheader.m_gapsLo_[fstrength * 3]; + ceparts[Collator.SECONDARY] + = listheader.m_gapsLo_[fstrength * 3 + 1]; + ceparts[Collator.TERTIARY] = getCEGenerator( + gens[Collator.TERTIARY], + listheader.m_gapsLo_, + listheader.m_gapsHi_, + token, fstrength); + } + else if (initstrength == Collator.SECONDARY) { + // secondaries + ceparts[Collator.PRIMARY] + = listheader.m_gapsLo_[fstrength * 3]; + ceparts[Collator.SECONDARY] + = getCEGenerator(gens[Collator.SECONDARY], + listheader.m_gapsLo_, + listheader.m_gapsHi_, + token, + fstrength); + ceparts[Collator.TERTIARY] = getSimpleCEGenerator( + gens[Collator.TERTIARY], + token, + Collator.TERTIARY); + } + else { + // primaries + ceparts[Collator.PRIMARY] = getCEGenerator( + gens[Collator.PRIMARY], + listheader.m_gapsLo_, + listheader.m_gapsHi_, + token, fstrength); + ceparts[Collator.SECONDARY] = getSimpleCEGenerator( + gens[Collator.SECONDARY], + token, + Collator.SECONDARY); + ceparts[Collator.TERTIARY] = getSimpleCEGenerator( + gens[Collator.TERTIARY], + token, + Collator.TERTIARY); + } + } + else { + if (token.m_strength_ == Collator.TERTIARY) { + ceparts[Collator.TERTIARY] = getNextGenerated( + gens[Collator.TERTIARY]); + } 
+ else if (token.m_strength_ == Collator.SECONDARY) { + ceparts[Collator.SECONDARY] = getNextGenerated( + gens[Collator.SECONDARY]); + ceparts[Collator.TERTIARY] = getSimpleCEGenerator( + gens[Collator.TERTIARY], + token, + Collator.TERTIARY); + } + else if (token.m_strength_ == Collator.PRIMARY) { + ceparts[Collator.PRIMARY] = getNextGenerated( + gens[Collator.PRIMARY]); + ceparts[Collator.SECONDARY] = getSimpleCEGenerator( + gens[Collator.SECONDARY], + token, + Collator.SECONDARY); + ceparts[Collator.TERTIARY] = getSimpleCEGenerator( + gens[Collator.TERTIARY], + token, + Collator.TERTIARY); + } + } + doCE(ceparts, token); + token = token.m_next_; + } + } + + /** + * Get the next generated ce + * @param g ce generator + * @return next generated ce + */ + private int getNextGenerated(CEGenerator g) + { + g.m_current_ = nextWeight(g); + return g.m_current_; + } + + /** + * @param g CEGenerator + * @param token rule token + * @param fstrength + * @return ce generator + * @exception Exception thrown when internal error occurs + */ + private int getSimpleCEGenerator(CEGenerator g, + CollationRuleParser.Token token, + int strength) throws Exception + { + int high, low, count = 1; + int maxbyte = (strength == Collator.TERTIARY) ? 
0x3F : 0xFF; + + if (strength == Collator.SECONDARY) { + low = RuleBasedCollator.COMMON_TOP_2_ << 24; + high = 0xFFFFFFFF; + count = 0xFF - RuleBasedCollator.COMMON_TOP_2_; + } + else { + low = RuleBasedCollator.BYTE_COMMON_ << 24; //0x05000000; + high = 0x40000000; + count = 0x40 - RuleBasedCollator.BYTE_COMMON_; + } + + if (token.m_next_ != null && token.m_next_.m_strength_ == strength) { + count = token.m_next_.m_toInsert_; + } + + g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte, + g.m_ranges_); + g.m_current_ = RuleBasedCollator.BYTE_COMMON_ << 24; + + if (g.m_rangesLength_ == 0) { + throw new Exception("Internal program error"); + } + return g.m_current_; + } + + /** + * Combines 2 ce into one with respect to the argument strength + * @param ce1 first ce + * @param ce2 second ce + * @param strength strength to use + * @return combined ce + */ + private static int mergeCE(int ce1, int ce2, int strength) + { + int mask = RuleBasedCollator.CE_TERTIARY_MASK_; + if (strength == Collator.SECONDARY) { + mask = RuleBasedCollator.CE_SECONDARY_MASK_; + } + else if (strength == Collator.PRIMARY) { + mask = RuleBasedCollator.CE_PRIMARY_MASK_; + } + ce1 &= mask; + ce2 &= mask; + switch (strength) + { + case Collator.PRIMARY: + return ce1 | ce2 >> 16; + case Collator.SECONDARY: + return ce1 << 16 | ce2 << 8; + default: + return ce1 << 24 | ce2 << 16; + } + } + + /** + * @param g CEGenerator + * @param lows low gap array + * @param highs high gap array + * @param token rule token + * @param fstrength + * @exception Exception thrown when internal error occurs + */ + private int getCEGenerator(CEGenerator g, int lows[], int highs[], + CollationRuleParser.Token token, int fstrength) + throws Exception + { + int strength = token.m_strength_; + int low = lows[fstrength * 3 + strength]; + int high = highs[fstrength * 3 + strength]; + int maxbyte = (strength == Collator.TERTIARY) ? 
0x3F : 0xFF; + + int count = token.m_toInsert_; + + if (low >= high && strength > Collator.PRIMARY) { + int s = strength; + while (true) { + s --; + if (lows[fstrength * 3 + s] != highs[fstrength * 3 + s]) { + if (strength == Collator.SECONDARY) { + low = RuleBasedCollator.COMMON_TOP_2_ << 24; + high = 0xFFFFFFFF; + } + else { + // low = 0x02000000; + // This needs to be checked - what if low is + // not good... + high = 0x40000000; + } + break; + } + if (s < 0) { + throw new Exception("Internal program error"); + } + } + } + if (low == 0) { + low = 0x01000000; + } + if (strength == Collator.SECONDARY) { // similar as simple + if (low >= (RuleBasedCollator.COMMON_BOTTOM_2_ <<24) + && low < (RuleBasedCollator.COMMON_TOP_2_ << 24)) { + low = RuleBasedCollator.COMMON_TOP_2_ << 24; + } + if (high > (RuleBasedCollator.COMMON_BOTTOM_2_ << 24) + && high < (RuleBasedCollator.COMMON_TOP_2_ << 24)) { + high = RuleBasedCollator.COMMON_TOP_2_ << 24; + } + if (low < (RuleBasedCollator.COMMON_BOTTOM_2_ << 24)) { + g.m_rangesLength_ = allocateWeights( + RuleBasedCollator.COMMON_TOP_2_ << 24, + high, count, maxbyte, g.m_ranges_); + g.m_current_ = RuleBasedCollator.COMMON_BOTTOM_2_; + return g.m_current_; + } + } + + g.m_rangesLength_ = allocateWeights(low, high, count, maxbyte, + g.m_ranges_); + if (g.m_rangesLength_ == 0) { + throw new Exception("Internal program error"); + } + g.m_current_ = nextWeight(g); + return g.m_current_; + } + + /** + * @param ceparts list of collation elements parts + * @param token rule token + */ + private void doCE(int ceparts[], CollationRuleParser.Token token) + { + // this one makes the table and stuff + int noofbytes[] = new int[3]; + for (int i = 0; i < 3; i ++) { + noofbytes[i] = countBytes(ceparts[i]); + } + + // Here we have to pack CEs from parts + int cei = 0; + int value = 0; + + while ((cei << 1) < noofbytes[0] || cei < noofbytes[1] + || ceiRuleBasedCollator constructor that takes the rules. 
+ * Please see RuleBasedCollator class description for more details on the + * collation rule syntax.
+ * @see java.util.Locale + * @param rules the collation rules to build the collation table from. + * @exception ParseException thrown when argument rules have an invalid + * syntax. + * @draft 2.2 + */ + CollationRuleParser(String rules) throws ParseException + { + m_rules_ = Normalizer.decompose(rules, false); + m_source_ = new StringBuffer(m_rules_); + m_current_ = 0; + m_extra_ = new StringBuffer(); + m_extraCurrent_ = m_source_.length(); + m_variableTop_ = null; + m_parsedToken_ = new ParsedToken(); + m_hashTable_ = new Hashtable(); + m_options_ = new OptionSet(RuleBasedCollator.UCA_); + m_listHeader_ = new TokenListHeader[512]; + m_resultLength_ = 0; + assembleTokenList(); + } + + // package private inner classes ----------------------------------------- + + /** + * Collation options set + */ + static class OptionSet + { + // package private constructor --------------------------------------- + + /** + * Initializes the option set with the argument collators + * @param collator option to use + */ + OptionSet(RuleBasedCollator collator) + { + m_variableTopValue_ = collator.m_variableTopValue_; + m_isFrenchCollation_ = collator.isFrenchCollation(); + m_isAlternateHandlingShifted_ = collator.isAlternateHandling(true); + m_caseFirst_ = collator.m_caseFirst_; + m_isCaseLevel_ = collator.isCaseLevel(); + m_decomposition_ = collator.getDecomposition(); + m_strength_ = collator.getStrength(); + m_isHiragana4_ = collator.m_isHiragana4_; + } + + // package private data members -------------------------------------- + + int m_variableTopValue_; + boolean m_isFrenchCollation_; + /** + * Attribute for handling variable elements + */ + boolean m_isAlternateHandlingShifted_; + /** + * who goes first, lower case or uppercase + */ + int m_caseFirst_; + /** + * do we have an extra case level + */ + boolean m_isCaseLevel_; + /** + * attribute for normalization + */ + int m_decomposition_; + /** + * attribute for strength + */ + int m_strength_; + /** + * attribute for special 
Hiragana + */ + boolean m_isHiragana4_; + }; + + /** + * List of tokens used by the collation rules + */ + static class TokenListHeader + { + Token m_first_; + Token m_last_; + Token m_reset_; + boolean m_indirect_; + int m_baseCE_; + int m_baseContCE_; + int m_nextCE_; + int m_nextContCE_; + int m_previousCE_; + int m_previousContCE_; + int m_pos_[] = new int[Collator.IDENTICAL + 1]; + int m_gapsLo_[] = new int[3 * (Collator.TERTIARY + 1)]; + int m_gapsHi_[] = new int[3 * (Collator.TERTIARY + 1)]; + int m_numStr_[] = new int[3 * (Collator.TERTIARY + 1)]; + Token m_fStrToken_[] = new Token[Collator.TERTIARY + 1]; + Token m_lStrToken_[] = new Token[Collator.TERTIARY + 1]; + }; + + /** + * Token wrapper for collation rules + */ + static class Token + { + // package private data members --------------------------------------- + + int m_CE_[]; + int m_CELength_; + int m_expCE_[]; + int m_expCELength_; + int m_source_; + int m_expansion_; + int m_prefix_; + int m_strength_; + int m_toInsert_; + int m_polarity_; // 1 for <, <<, <<<, , ; and 0 for >, >>, >>> + TokenListHeader m_listHeader_; + Token m_previous_; + Token m_next_; + String m_rules_; + + // package private constructors --------------------------------------- + + Token() + { + m_CE_ = new int[128]; + m_expCE_ = new int[128]; + // TODO: this should also handle reverse + m_polarity_ = TOKEN_POLARITY_POSITIVE_; + m_next_ = null; + m_previous_ = null; + m_CELength_ = 0; + m_expCELength_ = 0; + } + + // package private methods -------------------------------------------- + + /** + * Hashcode calculation for token + * @return the hashcode + */ + public int hashCode() + { + int result = 0; + int len = (m_source_ & 0xFF000000) >> 24; + int inc = ((len - 32) / 32) + 1; + + int start = m_source_ & 0x00FFFFFF; + int limit = start + len; + + while (start < limit) { + result = (result * 37) + m_rules_.charAt(start); + start += inc; + } + return result; + } + + /** + * Equals calculation + * @param target object to compare 
+ * @return true if target is the same as this object + */ + public boolean equals(Object target) + { + if (target == this) { + return true; + } + if (target instanceof Token) { + Token t = (Token)target; + int sstart = m_source_ & 0x00FFFFFF; + int tstart = t.m_source_ & 0x00FFFFFF; + int slimit = (m_source_ & 0xFF000000) >> 24; + int tlimit = (m_source_ & 0xFF000000) >> 24; + + int end = sstart + slimit - 1; + + if (m_source_ == 0 || t.m_source_ == 0) { + return false; + } + if (slimit != tlimit) { + return false; + } + if (m_source_ == t.m_source_) { + return true; + } + while (sstart < end + && m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) + { + ++ sstart; + ++ tstart; + } + if (m_rules_.charAt(sstart) == t.m_rules_.charAt(tstart)) { + return true; + } + } + return false; + } + }; + + // package private data member ------------------------------------------- + + /** + * Indicator that the token is resetted yet, ie & in the rules + */ + static final int TOKEN_RESET_ = 0xDEADBEEF; + + /** + * Size of the number of tokens + */ + int m_resultLength_; + /** + * List of parsed tokens + */ + TokenListHeader m_listHeader_[]; + /** + * Variable top token + */ + Token m_variableTop_; + /** + * Collation options + */ + OptionSet m_options_; + /** + * Normalized collation rules with some extra characters + */ + StringBuffer m_source_; + /** + * Hash table to keep all tokens + */ + Hashtable m_hashTable_; + + // package private method ------------------------------------------------ + + void setDefaultOptionsInCollator(RuleBasedCollator collator) + { + collator.m_defaultStrength_ = m_options_.m_strength_; + collator.m_defaultDecomposition_ = m_options_.m_decomposition_; + collator.m_defaultIsFrenchCollation_ = m_options_.m_isFrenchCollation_; + collator.m_defaultIsAlternateHandlingShifted_ + = m_options_.m_isAlternateHandlingShifted_; + collator.m_defaultIsCaseLevel_ = m_options_.m_isCaseLevel_; + collator.m_defaultCaseFirst_ = m_options_.m_caseFirst_; + 
collator.m_defaultIsHiragana4_ = m_options_.m_isHiragana4_; + } + + // private inner classes ------------------------------------------------- + + /** + * This is a token that has been parsed but not yet processed. Used to + * reduce the number of arguments in the parser + */ + private static class ParsedToken + { + // private constructor ---------------------------------------------- + + /** + * Empty constructor + */ + ParsedToken() + { + m_charsLen_ = 0; + m_charsOffset_ = 0; + m_extensionLen_ = 0; + m_extensionOffset_ = 0; + m_prefixLen_ = 0; + m_prefixOffset_ = 0; + m_flags_ = 0; + m_strength_ = TOKEN_UNSET_; + } + + // private data members --------------------------------------------- + + int m_strength_; + int m_charsOffset_; + int m_charsLen_; + int m_extensionOffset_; + int m_extensionLen_; + int m_prefixOffset_; + int m_prefixLen_; + char m_flags_; + char m_indirectIndex_; + }; + + /** + * Boundary wrappers + */ + private static class IndirectBoundaries + { + // package private constructor --------------------------------------- + + IndirectBoundaries(int startce, int startcontce, int limitce, + int limitcontce) + { + m_startCE_ = startce; + m_startContCE_ = startcontce; + m_limitCE_ = limitce; + m_limitContCE_ = limitcontce; + } + + // package private data members -------------------------------------- + + int m_startCE_; + int m_startContCE_; + int m_limitCE_; + int m_limitContCE_; + }; + + /** + * Collation option rule tag + */ + private static class TokenOption + { + // package private constructor --------------------------------------- + + TokenOption(String name, int attribute, String suboptions[], + int suboptionattributevalue[]) + { + m_name_ = name; + m_attribute_ = attribute; + m_subOptions_ = suboptions; + m_subOptionAttributeValues_ = suboptionattributevalue; + } + + // package private data member --------------------------------------- + + private String m_name_; + private int m_attribute_; + private String m_subOptions_[]; + private int 
m_subOptionAttributeValues_[]; + }; + + // private variables ----------------------------------------------------- + + /** + * Current parsed token + */ + private ParsedToken m_parsedToken_; + /** + * Collation rule + */ + private String m_rules_; + private int m_current_; + /** + * Current offset in m_source + */ + private int m_sourceLimit_; + /** + * Extra characters to keep during expansion + */ + private StringBuffer m_extra_; + /** + * Offset to m_extra_ + */ + private int m_extraCurrent_; + /** + * This is space for the extra strings that need to be unquoted during the + * parsing of the rules + */ + private static final int TOKEN_EXTRA_RULE_SPACE_SIZE_ = 2048; + /** + * Indicator that the token is not set yet + */ + private static final int TOKEN_UNSET_ = 0xFFFFFFFF; + /** + * Indicator that the rule is in the > polarity, ie everything on the + * right of the rule is less than + */ + private static final int TOKEN_POLARITY_NEGATIVE_ = 0; + /** + * Indicator that the rule is in the < polarity, ie everything on the + * right of the rule is greater than + */ + private static final int TOKEN_POLARITY_POSITIVE_ = 1; + /** + * Flag mask to determine if top is set + */ + private static final int TOKEN_TOP_MASK_ = 0x04; + /** + * Flag mask to determine if variable top is set + */ + private static final int TOKEN_VARIABLE_TOP_MASK_ = 0x08; + /** + * Flag mask to determine if a before attribute is set + */ + private static final int TOKEN_BEFORE_ = 0x03; + /** + * For use in parsing token options + */ + private static final int TOKEN_SUCCESS_MASK_ = 0x10; + + /** + * Tailoring reset top value + */ + private static final int RESET_TOP_VALUE_ = 0x9F000303; + /** + * Tailoring next top value + */ + private static final int NEXT_TOP_VALUE_ = 0xE8960303; + /** + * First primary ignorable ce + */ + private static final int FIRST_PRIMARY_IGNORABLE_ = 0x00008705; + /** + * Last primary ignorable ce + */ + private static final int LAST_PRIMARY_IGNORABLE_ = 0x0000DD05; + /** + 
* Last primary ignorable continuation ce + */ + private static final int LAST_PRIMARY_IGNORABLE_CONT_ = 0x0000C1C0; + /** + * First secondary ignorable ce + */ + private static final int FIRST_SECONDARY_IGNORABLE_ = 0x00000000; + /** + * Last secondary ignorable ce + */ + private static final int LAST_SECONDARY_IGNORABLE_ = 0x00000500; + /** + * First tertiary ignorable ce + */ + private static final int FIRST_TERTIARY_IGNORABLE_ = 0x00000000; + /** + * Last tertiary ignorable ce + */ + private static final int LAST_TERTIARY_IGNORABLE_ = 0x00000000; + /** + * First variable ce + */ + private static final int FIRST_VARIABLE_ = 0x05070505; + /** + * Last variable ce + */ + private static final int LAST_VARIABLE_ = 0x13CF0505; + /** + * First non variable ce + */ + private static final int FIRST_NON_VARIABLE_ = 0x16200505; + /** + * Last non variable ce + */ + private static final int LAST_NON_VARIABLE_ = 0x767C0505; + + /** + * These values are used for finding CE values for indirect positioning. + * Indirect positioning is a mechanism for allowing resets on symbolic + * values. It only works for resets and you cannot tailor indirect names. + * An indirect name can define either an anchor point or a range. An anchor + * point behaves in exactly the same way as a code point in reset would, + * except that it cannot be tailored. A range (we currently only know for + * the [top] range will explicitly set the upper bound for generated CEs, + * thus allowing for better control over how many CEs can be squeezed + * between in the range without performance penalty. In that respect, we use + * [top] for tailoring of locales that use CJK characters. Other indirect + * values are currently a pure convenience, they can be used to assure that + * the CEs will be always positioned in the same place relative to a point + * with known properties (e.g. first primary ignorable). 
+ */ + private static final IndirectBoundaries INDIRECT_BOUNDARIES_[] = { + new IndirectBoundaries(RESET_TOP_VALUE_, 0, NEXT_TOP_VALUE_, 0), + new IndirectBoundaries(FIRST_PRIMARY_IGNORABLE_, 0, 0, 0), + new IndirectBoundaries(LAST_PRIMARY_IGNORABLE_, + LAST_PRIMARY_IGNORABLE_CONT_, 0, 0), + new IndirectBoundaries(FIRST_SECONDARY_IGNORABLE_, 0, 0, 0), + new IndirectBoundaries(LAST_SECONDARY_IGNORABLE_, 0, 0, 0), + new IndirectBoundaries(FIRST_TERTIARY_IGNORABLE_, 0, 0, 0), + new IndirectBoundaries(LAST_TERTIARY_IGNORABLE_, 0, 0, 0), + new IndirectBoundaries(FIRST_VARIABLE_, 0, 0, 0), + new IndirectBoundaries(LAST_VARIABLE_, 0, 0, 0), + new IndirectBoundaries(FIRST_NON_VARIABLE_, 0, 0, 0), + new IndirectBoundaries(LAST_NON_VARIABLE_, 0, 0, 0), + }; + + /** + * Inverse UCA constants + */ + private static final int INVERSE_SIZE_MASK_ = 0xFFF00000; + private static final int INVERSE_OFFSET_MASK_ = 0x000FFFFF; + private static final int INVERSE_SHIFT_VALUE_ = 20; + + /** + * Collation option tags + * [last variable] last variable value + * [last primary ignorable] largest CE for primary ignorable + * [last secondary ignorable] largest CE for secondary ignorable + * [last tertiary ignorable] largest CE for tertiary ignorable + * [top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8) + */ + private static final TokenOption RULES_OPTIONS_[]; + + static + { + RULES_OPTIONS_ = new TokenOption[17]; + String option[] = {"non-ignorable", "shifted"}; + int value[] = {RuleBasedCollator.AttributeValue.NON_IGNORABLE_, + RuleBasedCollator.AttributeValue.SHIFTED_}; + RULES_OPTIONS_[0] = new TokenOption("alternate", + RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, + option, value); + option = new String[1]; + option[0] = "2"; + value = new int[1]; + value[0] = RuleBasedCollator.AttributeValue.ON_; + RULES_OPTIONS_[1] = new TokenOption("backwards", + RuleBasedCollator.Attribute.FRENCH_COLLATION_, + option, value); + String offonoption[] = new String[2]; + 
offonoption[0] = "off";
+        offonoption[1] = "on";
+        int offonvalue[] = new int[2];
+        offonvalue[0] = RuleBasedCollator.AttributeValue.OFF_;
+        offonvalue[1] = RuleBasedCollator.AttributeValue.ON_;
+        RULES_OPTIONS_[2] = new TokenOption("caseLevel",
+                              RuleBasedCollator.Attribute.CASE_LEVEL_,
+                              offonoption, offonvalue);
+        option = new String[3];
+        option[0] = "lower";
+        option[1] = "upper";
+        // Bug fix: "off" belongs in option[2]. The old code assigned
+        // option[1] twice, clobbering "upper" and leaving option[2] null,
+        // so neither [caseFirst upper] nor [caseFirst off] could be parsed
+        // correctly (value[1]/value[2] below expect upper/off in order).
+        option[2] = "off";
+        value = new int[3];
+        value[0] = RuleBasedCollator.AttributeValue.LOWER_FIRST_;
+        value[1] = RuleBasedCollator.AttributeValue.UPPER_FIRST_;
+        value[2] = RuleBasedCollator.AttributeValue.OFF_;
+        RULES_OPTIONS_[3] = new TokenOption("caseFirst",
+                              RuleBasedCollator.Attribute.CASE_FIRST_,
+                              option, value);
+        RULES_OPTIONS_[4] = new TokenOption("normalization",
+                              RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
+                              offonoption, offonvalue);
+        RULES_OPTIONS_[5] = new TokenOption("hiraganaQ",
+                        RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
+                              offonoption, offonvalue);
+        option = new String[5];
+        option[0] = "1";
+        option[1] = "2";
+        option[2] = "3";
+        option[3] = "4";
+        option[4] = "I";
+        value = new int[5];
+        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
+        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
+        value[2] = RuleBasedCollator.AttributeValue.TERTIARY_;
+        value[3] = RuleBasedCollator.AttributeValue.QUATERNARY_;
+        value[4] = RuleBasedCollator.AttributeValue.IDENTICAL_;
+        RULES_OPTIONS_[6] = new TokenOption("strength",
+                              RuleBasedCollator.Attribute.STRENGTH_,
+                              option, value);
+        RULES_OPTIONS_[7] = new TokenOption("variable top",
+                              RuleBasedCollator.Attribute.LIMIT_,
+                              null, null);
+        RULES_OPTIONS_[8] = new TokenOption("rearrange",
+                              RuleBasedCollator.Attribute.LIMIT_,
+                              null, null);
+        option = new String[3];
+        option[0] = "1";
+        option[1] = "2";
+        option[2] = "3";
+        value = new int[3];
+        value[0] = RuleBasedCollator.AttributeValue.PRIMARY_;
+        value[1] = RuleBasedCollator.AttributeValue.SECONDARY_;
+        value[2] = 
RuleBasedCollator.AttributeValue.TERTIARY_; + RULES_OPTIONS_[9] = new TokenOption("before", + RuleBasedCollator.Attribute.LIMIT_, + option, value); + RULES_OPTIONS_[10] = new TokenOption("top", + RuleBasedCollator.Attribute.LIMIT_, + null, null); + String firstlastoption[] = new String[5]; + firstlastoption[0] = "primary"; + firstlastoption[1] = "secondary"; + firstlastoption[2] = "tertiary"; + firstlastoption[3] = "variable"; + firstlastoption[4] = "non-ignorable"; + int firstlastvalue[] = new int[5]; + firstlastvalue[0] = RuleBasedCollator.AttributeValue.PRIMARY_; + firstlastvalue[1] = RuleBasedCollator.AttributeValue.PRIMARY_; + firstlastvalue[2] = RuleBasedCollator.AttributeValue.PRIMARY_; + firstlastvalue[3] = RuleBasedCollator.AttributeValue.PRIMARY_; + firstlastvalue[4] = RuleBasedCollator.AttributeValue.PRIMARY_; + RULES_OPTIONS_[11] = new TokenOption("first", + RuleBasedCollator.Attribute.LIMIT_, + firstlastoption, firstlastvalue); + RULES_OPTIONS_[12] = new TokenOption("last", + RuleBasedCollator.Attribute.LIMIT_, + firstlastoption, firstlastvalue); + RULES_OPTIONS_[13] = new TokenOption("undefined", + RuleBasedCollator.Attribute.LIMIT_, + null, null); + RULES_OPTIONS_[14] = new TokenOption("scriptOrder", + RuleBasedCollator.Attribute.LIMIT_, + null, null); + RULES_OPTIONS_[15] = new TokenOption("charsetname", + RuleBasedCollator.Attribute.LIMIT_, + null, null); + RULES_OPTIONS_[16] = new TokenOption("charset", + RuleBasedCollator.Attribute.LIMIT_, + null, null); + }; + + // private methods ------------------------------------------------------- + + /** + * Assembles the token list + * @param + * @exception ParseException thrown when rules syntax fails + */ + private int assembleTokenList() throws ParseException + { + Token lastToken = null; + int parseendoffset = -1; + m_parsedToken_.m_strength_ = TOKEN_UNSET_; + int sourcelimit = m_source_.length(); + int expandNext = 0; + + while (m_current_ < sourcelimit) { + m_parsedToken_.m_prefixOffset_ = 0; + // 
synwee todo + parseendoffset = parseNextToken(lastToken == null); + char specs = m_parsedToken_.m_flags_; + boolean variableTop = ((specs & TOKEN_VARIABLE_TOP_MASK_) != 0); + boolean top = ((specs & TOKEN_TOP_MASK_) != 0); + int lastStrength = TOKEN_UNSET_; + if (lastToken != null) { + lastStrength = lastToken.m_strength_; + } + Token key = new Token(); + key.m_source_ = m_parsedToken_.m_charsLen_ << 24 + | m_parsedToken_.m_charsOffset_; + key.m_rules_ = m_rules_; + // 4 Lookup each source in the CharsToToken map, and find a + // sourcetoken + Token sourceToken = (Token)m_hashTable_.get(key); + if (m_parsedToken_.m_strength_ != TOKEN_RESET_) { + if (lastToken == null) { + // this means that rules haven't started properly + throwParseException(m_source_.toString(), 0); + } + // 6 Otherwise (when relation != reset) + if (sourceToken == null) { + // If sourceToken is null, create new one + sourceToken = new Token(); + sourceToken.m_rules_ = m_source_.toString(); + sourceToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 + | m_parsedToken_.m_charsOffset_; + sourceToken.m_prefix_ = m_parsedToken_.m_prefixLen_ << 24 + | m_parsedToken_.m_prefixOffset_; + // TODO: this should also handle reverse + sourceToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_; + sourceToken.m_next_ = null; + sourceToken.m_previous_ = null; + sourceToken.m_CELength_ = 0; + sourceToken.m_expCELength_ = 0; + m_hashTable_.put(sourceToken, sourceToken); + } + else { + // we could have fished out a reset here + if (sourceToken.m_strength_ != TOKEN_RESET_ + && lastToken != sourceToken) { + // otherwise remove sourceToken from where it was. 
+ if (sourceToken.m_next_ != null) { + if (sourceToken.m_next_.m_strength_ + > sourceToken.m_strength_) { + sourceToken.m_next_.m_strength_ + = sourceToken.m_strength_; + } + sourceToken.m_next_.m_previous_ + = sourceToken.m_previous_; + } + else { + sourceToken.m_listHeader_.m_last_ + = sourceToken.m_previous_; + } + if (sourceToken.m_previous_ != null) { + sourceToken.m_previous_.m_next_ + = sourceToken.m_next_; + } + else { + sourceToken.m_listHeader_.m_first_ + = sourceToken.m_next_; + } + sourceToken.m_next_ = null; + sourceToken.m_previous_ = null; + } + } + sourceToken.m_strength_ = m_parsedToken_.m_strength_; + sourceToken.m_listHeader_ = lastToken.m_listHeader_; + + // 1. Find the strongest strength in each list, and set + // strongestP and strongestN accordingly in the headers. + if (lastStrength == TOKEN_RESET_ + || sourceToken.m_listHeader_.m_first_ == null) { + // If LAST is a reset insert sourceToken in the list. + if (sourceToken.m_listHeader_.m_first_ == null) { + sourceToken.m_listHeader_.m_first_ = sourceToken; + sourceToken.m_listHeader_.m_last_ = sourceToken; + } + else { // we need to find a place for us + // and we'll get in front of the same strength + if (sourceToken.m_listHeader_.m_first_.m_strength_ + <= sourceToken.m_strength_) { + sourceToken.m_next_ + = sourceToken.m_listHeader_.m_first_; + sourceToken.m_next_.m_previous_ = sourceToken; + sourceToken.m_listHeader_.m_first_ = sourceToken; + sourceToken.m_previous_ = null; + } + else { + lastToken = sourceToken.m_listHeader_.m_first_; + while (lastToken.m_next_ != null + && lastToken.m_next_.m_strength_ + > sourceToken.m_strength_) { + lastToken = lastToken.m_next_; + } + if (lastToken.m_next_ != null) { + lastToken.m_next_.m_previous_ = sourceToken; + } + else { + sourceToken.m_listHeader_.m_last_ + = sourceToken; + } + sourceToken.m_previous_ = lastToken; + sourceToken.m_next_ = lastToken.m_next_; + lastToken.m_next_ = sourceToken; + } + } + } + else { + // Otherwise (when LAST is not a 
reset) + // if polarity (LAST) == polarity(relation), insert + // sourceToken after LAST, otherwise insert before. + // when inserting after or before, search to the next + // position with the same strength in that direction. + // (This is called postpone insertion). + if (sourceToken != lastToken) { + if (lastToken.m_polarity_ == sourceToken.m_polarity_) { + while (lastToken.m_next_ != null + && lastToken.m_next_.m_strength_ + > sourceToken.m_strength_) { + lastToken = lastToken.m_next_; + } + sourceToken.m_previous_ = lastToken; + if (lastToken.m_next_ != null) { + lastToken.m_next_.m_previous_ = sourceToken; + } + else { + sourceToken.m_listHeader_.m_last_ = sourceToken; + } + sourceToken.m_next_ = lastToken.m_next_; + lastToken.m_next_ = sourceToken; + } + else { + while (lastToken.m_previous_ != null + && lastToken.m_previous_.m_strength_ + > sourceToken.m_strength_) { + lastToken = lastToken.m_previous_; + } + sourceToken.m_next_ = lastToken; + if (lastToken.m_previous_ != null) { + lastToken.m_previous_.m_next_ = sourceToken; + } + else { + sourceToken.m_listHeader_.m_first_ + = sourceToken; + } + sourceToken.m_previous_ = lastToken.m_previous_; + lastToken.m_previous_ = sourceToken; + } + } + else { // repeated one thing twice in rules, stay with the + // stronger strength + if (lastStrength < sourceToken.m_strength_) { + sourceToken.m_strength_ = lastStrength; + } + } + } + // if the token was a variable top, we're gonna put it in + if (variableTop == true && m_variableTop_ == null) { + variableTop = false; + m_variableTop_ = sourceToken; + } + // Treat the expansions. + // There are two types of expansions: explicit (x / y) and + // reset based propagating expansions + // (&abc * d * e <=> &ab * d / c * e / c) + // if both of them are in effect for a token, they are combined. + sourceToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 + | m_parsedToken_.m_extensionOffset_; + if (expandNext != 0) { + if (sourceToken.m_strength_ == RuleBasedCollator.PRIMARY) { + // primary strength kills off the implicit expansion + expandNext = 0; + } + else if (sourceToken.m_expansion_ == 0) { + // if there is no expansion, implicit is just added to + // the token + sourceToken.m_expansion_ = expandNext; + } + else { + // there is both explicit and implicit expansion. + // We need to make a combination + m_extra_.delete(0, m_extra_.length()); + int start = expandNext & 0xFFFFFF; + m_extra_.append(m_source_.substring(start, + start + (expandNext >>> 24))); + start = m_parsedToken_.m_extensionOffset_; + m_extra_.append(m_source_.substring(start, + start + m_parsedToken_.m_extensionLen_)); + sourceToken.m_expansion_ = ((expandNext >>> 24) + + m_parsedToken_.m_extensionLen_) << 24 + | m_extraCurrent_; + m_extraCurrent_ += (expandNext >>> 24) + + m_parsedToken_.m_extensionLen_; + } + } + } + else { + if (lastToken != null && lastStrength == TOKEN_RESET_) { + // if the previous token was also a reset, this means that + // we have two consecutive resets and we want to remove the + // previous one if empty + if (m_listHeader_[m_resultLength_ - 1].m_first_ == null) { + m_resultLength_ --; + } + } + if (sourceToken == null) { + // this is a reset, but it might still be somewhere in the + // tailoring, in shorter form + int searchCharsLen = m_parsedToken_.m_charsLen_; + while (searchCharsLen > 1 && sourceToken == null) { + searchCharsLen --; + // key = searchCharsLen << 24 | charsOffset; + Token tokenkey = new Token(); + tokenkey.m_source_ = searchCharsLen << 24 + | m_parsedToken_.m_charsOffset_; + tokenkey.m_rules_ = m_source_.toString(); + sourceToken = (Token)m_hashTable_.get(tokenkey); + } + if (sourceToken != null) { + expandNext = (m_parsedToken_.m_charsLen_ + - searchCharsLen) << 24 + | (m_parsedToken_.m_charsOffset_ + + searchCharsLen); + } + } + if 
((specs & TOKEN_BEFORE_) != 0 && top == false) { + // we're doing before & there is no indirection + int strength = (specs & TOKEN_BEFORE_) - 1; + if (sourceToken != null + && sourceToken.m_strength_ != TOKEN_RESET_) { + // this is a before that is already ordered in the UCA + // - so we need to get the previous with good strength + while (sourceToken.m_strength_ > strength + && sourceToken.m_previous_ != null) { + sourceToken = sourceToken.m_previous_; + } + // here, either we hit the strength or NULL + if (sourceToken.m_strength_ == strength) { + if (sourceToken.m_previous_ != null) { + sourceToken = sourceToken.m_previous_; + } + else { // start of list + sourceToken + = sourceToken.m_listHeader_.m_reset_; + } + } + else { // we hit NULL, we should be doing the else part + sourceToken = sourceToken.m_listHeader_.m_reset_; + sourceToken = getVirginBefore(sourceToken, + strength); + } + } + else { + sourceToken = getVirginBefore(sourceToken, strength); + } + } + // 5 If the relation is a reset: + // If sourceToken is null + // Create new list, create new sourceToken, make the baseCE + // from source, put the sourceToken in ListHeader of the new + // list + if (sourceToken == null) { + m_listHeader_[m_resultLength_] = new TokenListHeader(); + // 3 Consider each item: relation, source, and expansion: + // e.g. ...< x / y ... + // First convert all expansions into normal form. + // Examples: + // If "xy" doesn't occur earlier in the list or in the UCA, + // convert &xy * c * d * ... into &x * c/y * d * ... + // Note: reset values can never have expansions, although + // they can cause the very next item to have one. They may + // be contractions, if they are found earlier in the list. 
+ if (top == false) { + CollationElementIterator coleiter + = RuleBasedCollator.UCA_.getCollationElementIterator( + m_source_.substring(m_parsedToken_.m_charsOffset_, + m_parsedToken_.m_charsOffset_ + + m_parsedToken_.m_charsLen_)); + + int CE = coleiter.next(); + int expand = coleiter.getOffset(); + int SecondCE = coleiter.next(); + + m_listHeader_[m_resultLength_].m_baseCE_ + = CE & 0xFFFFFF3F; + if (RuleBasedCollator.isContinuation(SecondCE)) { + m_listHeader_[m_resultLength_].m_baseContCE_ + = SecondCE; + } + else { + m_listHeader_[m_resultLength_].m_baseContCE_ = 0; + } + m_listHeader_[m_resultLength_].m_nextCE_ = 0; + m_listHeader_[m_resultLength_].m_nextContCE_ = 0; + m_listHeader_[m_resultLength_].m_previousCE_ = 0; + m_listHeader_[m_resultLength_].m_previousContCE_ = 0; + m_listHeader_[m_resultLength_].m_indirect_ = false; + sourceToken = new Token(); + expandNext = initAReset(expand, sourceToken); + } + else { // top == TRUE + top = false; + m_listHeader_[m_resultLength_].m_previousCE_ = 0; + m_listHeader_[m_resultLength_].m_previousContCE_ = 0; + m_listHeader_[m_resultLength_].m_indirect_ = true; + IndirectBoundaries ib = INDIRECT_BOUNDARIES_[ + m_parsedToken_.m_indirectIndex_]; + if ((specs & TOKEN_BEFORE_) == 0) { + // indirect without before, just use the supplied + // values + m_listHeader_[m_resultLength_].m_baseCE_ + = ib.m_startCE_; + m_listHeader_[m_resultLength_].m_baseContCE_ + = ib.m_startContCE_; + m_listHeader_[m_resultLength_].m_nextCE_ + = ib.m_limitCE_; + m_listHeader_[m_resultLength_].m_nextContCE_ + = ib.m_limitContCE_; + } + else { // there was a before + // we need to do slightly more work. we need to get + // the baseCE using the inverse UCA & getPrevious. 
+ // The next bound is not set, and will be decided + // in ucol_bld + int strength = (specs & TOKEN_BEFORE_) - 1; + int baseCE = ib.m_startCE_; + int baseContCE = ib.m_startContCE_;//&0xFFFFFF3F; + CollationParsedRuleBuilder.InverseUCA invuca + = CollationParsedRuleBuilder.INVERSE_UCA_; + int ce[] = {0, 0, 0}; + invuca.getInversePrevCE(baseCE, baseContCE, + strength, ce); + m_listHeader_[m_resultLength_].m_baseCE_ = ce[1]; + m_listHeader_[m_resultLength_].m_baseContCE_ + = ce[2]; + m_listHeader_[m_resultLength_].m_nextCE_ = 0; + m_listHeader_[m_resultLength_].m_nextContCE_ = 0; + } + sourceToken = new Token(); + expandNext = initAReset(0, sourceToken); + } + } + else { // reset to something already in rules + top = false; + } + } + // 7 After all this, set LAST to point to sourceToken, and goto + // step 3. + lastToken = sourceToken; + } + + if (m_resultLength_ > 0 + && m_listHeader_[m_resultLength_ - 1].m_first_ == null) { + m_resultLength_ --; + } + return m_resultLength_; + } + + /** + * Formats and throws a ParseException + * @param rules collation rule that failed + * @param offset failed offset in rules + * @throws ParseException with failure information + */ + private static final void throwParseException(String rules, int offset) + throws ParseException + { + // for pre-context + String precontext = rules.substring(0, offset); + String postcontext = rules.substring(offset, rules.length()); + StringBuffer error = new StringBuffer( + "Parse error occurred in rule at offset "); + error.append(offset); + error.append("\n after the prefix \""); + error.append(precontext); + error.append("\" before the suffix \""); + error.append(postcontext); + throw new ParseException(error.toString(), offset); + } + + /** + * Getting the next token + * @param startofrules flag indicating if we are at the start of rules + * @return the offset of the rules + * @exception ParseException thrown when rule parsing fails + */ + private int parseNextToken(boolean startofrules) throws 
ParseException + { + // parsing part + boolean variabletop = false; + boolean top = false; + boolean inchars = true; + boolean inquote = false; + boolean wasinquote = false; + byte before = 0; + boolean isescaped = false; + int newcharslen = 0, newextensionlen = 0; + int charsoffset = 0, extensionoffset = 0; + int newstrength = TOKEN_UNSET_; + + m_parsedToken_.m_prefixOffset_ = 0; + m_parsedToken_.m_prefixLen_ = 0; + m_parsedToken_.m_indirectIndex_ = 0; + + int limit = m_source_.length(); + while (m_current_ < limit) { + char ch = m_source_.charAt(m_current_); + if (inquote) { + if (ch == 0x0027) { // '\'' + inquote = false; + } + else { + if ((newcharslen == 0) || inchars) { + if (newcharslen == 0) { + charsoffset = m_extraCurrent_; + } + newcharslen ++; + } + else { + if (newextensionlen == 0) { + extensionoffset = m_extraCurrent_; + } + newextensionlen ++; + } + } + } + else if (isescaped) { + isescaped = false; + if (newstrength == TOKEN_UNSET_) { + throwParseException(m_rules_, m_current_); + } + if (ch != 0 && m_current_ != limit) { + if (inchars) { + if (newcharslen == 0) { + charsoffset = m_current_; + } + newcharslen ++; + } + else { + if (newextensionlen == 0) { + extensionoffset = m_current_; + } + newextensionlen ++; + } + } + } + else { + // Sets the strength for this entry + switch (ch) { + case 0x003D : // '=' + if (newstrength != TOKEN_UNSET_) { + return doEndParseNextToken(newstrength, newcharslen, + top, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + // if we start with strength, we'll reset to top + if (startofrules == true) { + return doEndParseNextToken(TOKEN_RESET_, + newcharslen, + true, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + newstrength = Collator.IDENTICAL; + break; + case 0x002C : // ',' + if (newstrength != TOKEN_UNSET_) { + return doEndParseNextToken(newstrength, newcharslen, + top, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + 
// if we start with strength, we'll reset to top + if (startofrules == true) { + return doEndParseNextToken(TOKEN_RESET_, + newcharslen, + true, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + newstrength = Collator.TERTIARY; + break; + case 0x003B : // ';' + if (newstrength != TOKEN_UNSET_) { + return doEndParseNextToken(newstrength, newcharslen, + true, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + // if we start with strength, we'll reset to top + if (startofrules == true) { + return doEndParseNextToken(TOKEN_RESET_, + newcharslen, + true, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + newstrength = Collator.SECONDARY; + break; + case 0x003C : // '<' + if (newstrength != TOKEN_UNSET_) { + return doEndParseNextToken(newstrength, newcharslen, + top, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + // if we start with strength, we'll reset to top + if (startofrules == true) { + return doEndParseNextToken(TOKEN_RESET_, + newcharslen, true, + charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + // before this, do a scan to verify whether this is + // another strength + if (m_source_.charAt(m_current_ + 1) == 0x003C) { + m_current_ ++; + if (m_source_.charAt(m_current_ + 1) == 0x003C) { + m_current_ ++; // three in a row! 
+ newstrength = Collator.TERTIARY; + } + else { // two in a row + newstrength = Collator.SECONDARY; + } + } + else { // just one + newstrength = Collator.PRIMARY; + } + break; + case 0x0026 : // '&' + if (newstrength != TOKEN_UNSET_) { + return doEndParseNextToken(newstrength, newcharslen, + top, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + newstrength = TOKEN_RESET_; // PatternEntry::RESET = 0 + break; + case 0x005b : // '[' + // options - read an option, analyze it + int optionend = m_rules_.indexOf(0x005d, m_current_); + if (optionend != -1) { // ']' + byte result = readAndSetOption(optionend); + m_current_ = optionend; + if ((result & TOKEN_TOP_MASK_) != 0) { + if (newstrength == TOKEN_RESET_) { + charsoffset = m_extraCurrent_; + m_source_.append((char)0xFFFE); + IndirectBoundaries ib = + INDIRECT_BOUNDARIES_[ + m_parsedToken_.m_indirectIndex_]; + m_source_.append((char)(ib.m_startCE_ + >> 16)); + m_source_.append((char)(ib.m_startCE_ + & 0xFFFF)); + m_extraCurrent_ += 3; + m_current_ ++; + return doEndParseNextToken(newstrength, + 3, true, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + else { + throwParseException(m_rules_, m_current_); + } + } + else if ((result & TOKEN_VARIABLE_TOP_MASK_) != 0) { + if (newstrength != TOKEN_RESET_ + && newstrength != TOKEN_UNSET_) { + charsoffset = m_extraCurrent_; + m_source_.append((char)0xFFFF); + m_extraCurrent_ ++; + m_current_ ++; + return doEndParseNextToken(newstrength, + 1, true, charsoffset, + extensionoffset, + newextensionlen, + variabletop, before); + } + else { + throwParseException(m_rules_, m_current_); + } + } + else if ((result & TOKEN_BEFORE_) != 0){ + if (newstrength == TOKEN_RESET_) { + before = (byte)(result & TOKEN_BEFORE_); + } + else { + throwParseException(m_rules_, m_current_); + } + } + } + break; + // Ignore the white spaces + case 0x0009 : // '\t' + case 0x000C : // '\f' + case 0x000D : // '\r' + case 0x000A : // '\n' + case 
0x0020 : // ' ' + break; // skip whitespace TODO use Unicode + case 0x002F : // '/' + wasinquote = false; // if we were copying source + // characters, we want to stop now + inchars = false; // we're now processing expansion + break; + case 0x005C : // back slash for escaped chars + isescaped = true; + break; + // found a quote, we're gonna start copying + case 0x0027 : //'\'' + if (newstrength == TOKEN_UNSET_) { + // quote is illegal until we have a strength + throwParseException(m_rules_, m_current_); + } + inquote = true; + if (inchars) { // we're doing characters + if (wasinquote == false) { + charsoffset = m_extraCurrent_; + } + if (newcharslen != 0) { + m_source_.append(m_source_.substring( + m_current_ - newcharslen, + m_current_)); + m_extraCurrent_ += newcharslen; + } + newcharslen ++; + } + else { // we're doing an expansion + if (wasinquote == false) { + extensionoffset = m_extraCurrent_; + } + if (newextensionlen != 0) { + m_source_.append(m_source_.substring( + m_current_ - newextensionlen, + m_current_)); + m_extraCurrent_ += newextensionlen; + } + newextensionlen ++; + } + wasinquote = true; + m_current_ ++; + ch = m_source_.charAt(m_current_); + if (ch == 0x0027) { // copy the double quote + m_source_.append(ch); + m_extraCurrent_ ++; + inquote = false; + } + break; + // '@' is french only if the strength is not currently set + // if it is, it's just a regular character in collation + case 0x0040 : // '@' + if (newstrength == TOKEN_UNSET_) { + m_options_.m_isFrenchCollation_ = true; + break; + } + case 0x007C : //| + // this means we have actually been reading prefix part + // we want to store read characters to the prefix part + // and continue reading the characters (proper way + // would be to restart reading the chars, but in that + // case we would have to complicate the token hasher, + // which I do not intend to play with. Instead, we will + // do prefixes when prefixes are due (before adding the + // elements). 
+ m_parsedToken_.m_prefixOffset_ = charsoffset; + m_parsedToken_.m_prefixLen_ = newcharslen; + if (inchars) { // we're doing characters + if (wasinquote == false) { + charsoffset = m_extraCurrent_; + } + if (newcharslen != 0) { + m_source_.append(m_source_.substring(m_current_, + newcharslen)); + m_extraCurrent_ += newcharslen; + } + newcharslen ++; + } + wasinquote = true; + m_current_ ++; + ch = m_source_.charAt(m_current_); + break; + default : + if (newstrength == TOKEN_UNSET_) { + throwParseException(m_rules_, m_current_); + } + if (isSpecialChar(ch) && (inquote == false)) { + throwParseException(m_rules_, m_current_); + } + if (ch == 0x0000 && m_current_ + 1 == limit) { + break; + } + if (inchars) { + if (newcharslen == 0) { + charsoffset = m_current_; + } + newcharslen++; + } + else { + if (newextensionlen == 0) { + extensionoffset = m_current_; + } + newextensionlen ++; + } + break; + } + } + if (wasinquote) { + if (ch != 0x27) { + m_source_.append(ch); + m_extraCurrent_ ++; + } + } + m_current_ ++; + } + return doEndParseNextToken(newstrength, newcharslen, top, charsoffset, + extensionoffset, newextensionlen, + variabletop, before); + } + + /** + * End the next parse token + * @param newstrength new strength + * @return offset in rules, -1 for end of rules + */ + private int doEndParseNextToken(int newstrength, int newcharslen, + boolean top, int charsoffset, + int extensionoffset, int newextensionlen, + boolean variabletop, int before) + throws ParseException + { + boolean wasinquote = false; + if (newstrength == TOKEN_UNSET_) { + return -1; + } + if (newcharslen == 0 && top == false) { + throwParseException(m_rules_, m_current_); + return -1; + } + + m_parsedToken_.m_strength_ = newstrength; + m_parsedToken_.m_charsOffset_ = charsoffset; + m_parsedToken_.m_charsLen_ = newcharslen; + m_parsedToken_.m_extensionOffset_ = extensionoffset; + m_parsedToken_.m_extensionLen_ = newextensionlen; + m_parsedToken_.m_flags_ = (char)((TOKEN_VARIABLE_TOP_MASK_ * + 
(variabletop ? 1 : 0)) + | (TOKEN_TOP_MASK_ * (top ? 1 : 0)) | before); + return m_current_; + } + + /** + * Token before this element + * @param sourcetoken + * @param strength collation strength + * @return the token before source token + * @exception ParseException thrown when rules have the wrong syntax + */ + private Token getVirginBefore(Token sourcetoken, int strength) + throws ParseException + { + // this is a virgin before - we need to fish the anchor from the UCA + StringBuffer str = new StringBuffer(); + if (sourcetoken != null) { + str.append(m_source_.charAt(sourcetoken.m_source_ & 0xFFFFFF)); + } + else { + str.append(m_source_.charAt(m_parsedToken_.m_charsOffset_)); + } + CollationElementIterator coleiter = + RuleBasedCollator.UCA_.getCollationElementIterator( + str.toString()); + int basece = coleiter.next() & 0xFFFFFF3F; + int basecontce = coleiter.next(); + if (basecontce == CollationElementIterator.NULLORDER) { + basecontce = 0; + } + int ce[] = new int[3]; // invpos, first ce and second ce + CollationParsedRuleBuilder.INVERSE_UCA_.getInversePrevCE(basece, + basecontce, strength, ce); + int invpos = ce[0]; + int ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_table_[3 * invpos + + 2]; + if ((ch & INVERSE_SIZE_MASK_) != 0) { + int offset = ch & INVERSE_OFFSET_MASK_; + ch = CollationParsedRuleBuilder.INVERSE_UCA_.m_continuations_[ + offset]; + } + m_source_.append((char)ch); + m_extraCurrent_ ++; + m_parsedToken_.m_charsOffset_ = m_extraCurrent_ - 1; + m_parsedToken_.m_charsLen_ = 1; + + // We got an UCA before. However, this might have been tailored. 
+ // example: + // &\u30ca = \u306a + // &[before 3]\u306a<<<\u306a|\u309d + + Token key = new Token(); + key.m_source_ = (m_parsedToken_.m_charsLen_ << 24) + | m_parsedToken_.m_charsOffset_; + key.m_rules_ = m_rules_; + sourcetoken = (Token)m_hashTable_.get(key); + + // if we found a tailored thing, we have to use the UCA value and + // construct a new reset token with constructed name + if (sourcetoken != null && sourcetoken.m_strength_ != TOKEN_RESET_) { + // character to which we want to anchor is already tailored. + // We need to construct a new token which will be the anchor point + m_source_.replace(m_extraCurrent_ - 1, m_extraCurrent_, "\uFFFE"); + m_source_.append(ch); + m_extraCurrent_ ++; + m_parsedToken_.m_charsLen_ ++; + m_listHeader_[m_resultLength_].m_baseCE_ = ce[0] & 0xFFFFFF3F; + if (RuleBasedCollator.isContinuation(ce[1])) { + m_listHeader_[m_resultLength_].m_baseContCE_ = ce[1]; + } + else { + m_listHeader_[m_resultLength_].m_baseContCE_ = 0; + } + m_listHeader_[m_resultLength_].m_nextCE_ = 0; + m_listHeader_[m_resultLength_].m_nextContCE_ = 0; + m_listHeader_[m_resultLength_].m_previousCE_ = 0; + m_listHeader_[m_resultLength_].m_previousContCE_ = 0; + m_listHeader_[m_resultLength_].m_indirect_ = false; + sourcetoken = new Token(); + initAReset(-1, sourcetoken); + } + return sourcetoken; + } + + /** + * Processing Description. + * 1. Build a m_listHeader_. Each list has a header, which contains two lists + * (positive and negative), a reset token, a baseCE, nextCE, and + * previousCE. The lists and reset may be null. + * 2. As you process, you keep a LAST pointer that points to the last token + * you handled. 
+ * @param expand string offset, -1 for null strings + * @param targetToken tken to update + * @return expandnext offset + * @throws ParseException thrown when rules syntax failed + */ + private int initAReset(int expand, Token targetToken) throws ParseException + { + // do the reset thing + targetToken.m_rules_ = m_rules_; + targetToken.m_source_ = m_parsedToken_.m_charsLen_ << 24 + | m_parsedToken_.m_charsOffset_; + targetToken.m_expansion_ = m_parsedToken_.m_extensionLen_ << 24 + | m_parsedToken_.m_extensionOffset_; + if (m_parsedToken_.m_prefixOffset_ != 0) { + throwParseException(m_rules_, m_parsedToken_.m_charsOffset_ - 1); + } + + targetToken.m_prefix_ = 0; + // TODO: this should also handle reverse + targetToken.m_polarity_ = TOKEN_POLARITY_POSITIVE_; + targetToken.m_strength_ = TOKEN_RESET_; + targetToken.m_next_ = null; + targetToken.m_previous_ = null; + targetToken.m_CELength_ = 0; + targetToken.m_expCELength_ = 0; + targetToken.m_listHeader_ = m_listHeader_[m_resultLength_]; + m_listHeader_[m_resultLength_].m_first_ = null; + m_listHeader_[m_resultLength_].m_last_ = null; + m_listHeader_[m_resultLength_].m_first_ = null; + m_listHeader_[m_resultLength_].m_last_ = null; + m_listHeader_[m_resultLength_].m_reset_ = targetToken; + + /* 3 Consider each item: relation, source, and expansion: + * e.g. ...< x / y ... + * First convert all expansions into normal form. Examples: + * If "xy" doesn't occur earlier in the list or in the UCA, convert + * &xy * c * d * ... into &x * c/y * d * ... + * Note: reset values can never have expansions, although they can + * cause the very next item to have one. They may be contractions, if + * they are found earlier in the list. 
+ */ + int result = 0; + if (expand != -1) { + // check to see if there is an expansion + if (m_parsedToken_.m_charsLen_ > 1) { + targetToken.m_source_ = ((expand + - m_parsedToken_.m_charsOffset_ ) + << 24) + | m_parsedToken_.m_charsOffset_; + result = ((m_parsedToken_.m_charsLen_ + + m_parsedToken_.m_charsOffset_ - expand) << 24) + | expand; + } + } + + m_resultLength_ ++; + m_hashTable_.put(targetToken, targetToken); + return result; + } + + /** + * Checks if an character is special + * @param ch character to test + * @return true if the character is special + */ + private static final boolean isSpecialChar(char ch) + { + return (ch <= 0x002F && ch >= 0x0020) || (ch <= 0x003F && ch >= 0x003A) + || (ch <= 0x0060 && ch >= 0x005B) + || (ch <= 0x007E && ch >= 0x007D) || ch == 0x007B; + } + + /** + * Reads and set collation options + * @param optionend offset to the end of the option in rules + * @return TOKEN_SUCCESS if option is set correct, 0 otherwise + * @exception ParseException thrown when options in rules are wrong + */ + private byte readAndSetOption(int optionend) throws ParseException + { + int start = m_current_ + 1; // skip opening '[' + int i = 0; + boolean foundoption = false; + int optionarg = 0; + while (i < RULES_OPTIONS_.length) { + String option = RULES_OPTIONS_[i].m_name_; + int optionlength = option.length(); + if (m_rules_.length() < start + optionlength) { + throwParseException(m_rules_, start); + } + if (option.equalsIgnoreCase(m_rules_.substring(start, start + + optionlength))) { + foundoption = true; + if (optionend - start > optionlength) { + optionarg = start + optionlength + 1; + // start of the options, skip space + while (UCharacter.isWhitespace(m_rules_.charAt(optionarg))) + { // eat whitespace + optionarg ++; + } + } + break; + } + i ++; + } + + if (!foundoption) { + throwParseException(m_rules_, start); + } + + if (i < 7) { + if (optionarg != 0) { + for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; + j ++) { + String 
subname = RULES_OPTIONS_[i].m_subOptions_[j]; + int size = optionarg + subname.length(); + if (m_rules_.length() > size && subname.equalsIgnoreCase( + m_rules_.substring(optionarg, + subname.length()))) { + setOptions(m_options_, RULES_OPTIONS_[i].m_attribute_, + RULES_OPTIONS_[i].m_subOptionAttributeValues_[j]); + return TOKEN_SUCCESS_MASK_; + } + } + } + throwParseException(m_rules_, optionarg); + } + else if (i == 7) { // variable top + return TOKEN_SUCCESS_MASK_ | TOKEN_VARIABLE_TOP_MASK_; + } + else if (i == 8) { // rearange + return TOKEN_SUCCESS_MASK_; + } + else if (i == 9) { // before + if (optionarg != 0) { + for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; + j ++) { + String subname = RULES_OPTIONS_[i].m_subOptions_[j]; + int size = optionarg + subname.length(); + if (m_rules_.length() > size && subname.equalsIgnoreCase( + m_rules_.substring(optionarg, + subname.length()))) { + return (byte)(TOKEN_SUCCESS_MASK_ + | RULES_OPTIONS_[i].m_subOptionAttributeValues_[j] + + 1); + } + } + } + throwParseException(m_rules_, optionarg); + } + else if (i == 10) { // top, we are going to have an array with + // structures of limit CEs index to this array will be + // src->parsedToken.indirectIndex + m_parsedToken_.m_indirectIndex_ = 0; + return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_; + } + else if (i < 13) { // first, last + for (int j = 0; j < RULES_OPTIONS_[i].m_subOptions_.length; j ++) { + String subname = RULES_OPTIONS_[i].m_subOptions_[j]; + int size = optionarg + subname.length(); + if (m_rules_.length() > size && subname.equalsIgnoreCase( + m_rules_.substring(optionarg, + subname.length()))) { + m_parsedToken_.m_indirectIndex_ = (char)(i - 10 + j << 1); + return TOKEN_SUCCESS_MASK_ | TOKEN_TOP_MASK_; + } + } + throwParseException(m_rules_, optionarg); + } + else { + throwParseException(m_rules_, optionarg); + } + return TOKEN_SUCCESS_MASK_; // we will never reach here. 
+ } + + /** + * Set collation option + * @param optionset option set to set + * @param attribute type to set + * @param value attribute value + */ + private void setOptions(OptionSet optionset, int attribute, int value) + { + switch (attribute) { + case RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_ : + optionset.m_isHiragana4_ + = (value == RuleBasedCollator.AttributeValue.ON_); + break; + case RuleBasedCollator.Attribute.FRENCH_COLLATION_ : + optionset.m_isFrenchCollation_ + = (value == RuleBasedCollator.AttributeValue.ON_); + break; + case RuleBasedCollator.Attribute.ALTERNATE_HANDLING_ : + optionset.m_isAlternateHandlingShifted_ + = (value + == RuleBasedCollator.AttributeValue.SHIFTED_); + break; + case RuleBasedCollator.Attribute.CASE_FIRST_ : + optionset.m_caseFirst_ = value; + break; + case RuleBasedCollator.Attribute.CASE_LEVEL_ : + optionset.m_isCaseLevel_ + = (value == RuleBasedCollator.AttributeValue.ON_); + break; + case RuleBasedCollator.Attribute.NORMALIZATION_MODE_ : + if (value == RuleBasedCollator.AttributeValue.ON_) { + value = Collator.CANONICAL_DECOMPOSITION; + } + optionset.m_decomposition_ = value; + break; + case RuleBasedCollator.Attribute.STRENGTH_ : + optionset.m_strength_ = value; + break; + default : + break; + } + } +} diff --git a/icu4j/src/com/ibm/icu/text/Collator.java b/icu4j/src/com/ibm/icu/text/Collator.java index d49af43ef18..ea8f3b6a736 100755 --- a/icu4j/src/com/ibm/icu/text/Collator.java +++ b/icu4j/src/com/ibm/icu/text/Collator.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Collator.java,v $ -* $Date: 2002/05/20 23:43:01 $ -* $Revision: 1.6 $ +* $Date: 2002/06/21 23:56:44 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -15,57 +15,103 @@ package com.ibm.icu.text; import java.util.Locale; /** -*The Collator class performs locale-sensitive String 
comparison. -* You use this class to build searching and sorting routines for natural -* language text.
-*Collator is an abstract base class. Subclasses implement specific -* collation strategies. One subclass, RuleBasedCollator, is currently -* provided and is applicable to a wide set of languages. Other subclasses -* may be created to handle more specialized needs.
-*Like other locale-sensitive classes, you can use the static factory -* method, getInstance, to obtain the appropriate Collator object for a given -* locale. You will only need to look at the subclasses of Collator if you need -* to understand the details of a particular collation strategy or if you need -* to modify that strategy.
-*The following example shows how to compare two strings using the Collator -* for the default locale. +*
+* Collator is an abstract base class, its subclasses performs +* locale-sensitive String comparison. A concrete subclass, RuleBasedCollator, +* is provided and it allows customization of the collation ordering by the use +* of rule sets. +*
+*+* Following the +* Unicode Consortium's specifications for +* the +* Unicode Collation Algorithm (UCA), there are +* 5 different levels of strength used in comparisons. +*
+* For more information about the collation service see the +* users +* guide. +*
+*+* Examples of use *
-* // Compare two strings in the default locale -* Collator myCollator = Collator.getInstance(); -* if (myCollator.compare("abc", "ABC") < 0) { -* System.out.println("abc is less than ABC"); -* } -* else { -* System.out.println("abc is greater than or equal to ABC"); -* } -*-*
You can set a Collator
's strength property to
-* determine the level of difference considered significant in comparisons.
-* Four strengths are provided: PRIMARY
, SECONDARY
,
-* TERTIARY
, and IDENTICAL
. The exact assignment of
-* strengths to language features is locale dependant. For example, in Czech,
-* "e" and "f" are considered primary differences, while "e" and "\u00EA" are
-* secondary differences, "e" and "E" are tertiary differences and "e" and "e"
-* are identical. The following shows how both case and accents could be
-* ignored for US English.
-* //Get the Collator for US English and set its strength to PRIMARY +* // Get the Collator for US English and set its strength to PRIMARY * Collator usCollator = Collator.getInstance(Locale.US); * usCollator.setStrength(Collator.PRIMARY); * if (usCollator.compare("abc", "ABC") == 0) { * System.out.println("Strings are equivalent"); * } +* +* The following example shows how to compare two strings using the Collator +* for the default locale. +* // Compare two strings in the default locale +* Collator myCollator = Collator.getInstance(); +* myCollator.setDecomposition(NO_DECOMPOSITION); +* if (myCollator.compare("à\u0325", "a\u0325̀") != 0) { +* System.out.println("à\u0325 is not equals to a\u0325̀ without decomposition"); +* myCollator.setDecomposition(CANONICAL_DECOMPOSITION); +* if (myCollator.compare("à\u0325", "a\u0325̀") != 0) { +* System.out.println("Error: à\u0325 should be equals to a\u0325̀ with decomposition"); +* } +* else { +* System.out.println("à\u0325 is equals to a\u0325̀ with decomposition"); +* } +* } +* else { +* System.out.println("Error: à\u0325 should be not equals to a\u0325̀ without decomposition"); +* } *-*
For comparing Strings exactly once, the compare method provides the best -* performance. When sorting a list of Strings however, it is generally -* necessary to compare each String multiple times. In this case, -* CollationKeys provide better performance. The CollationKey class converts a -* String to a series of bits that can be compared bitwise against other -* CollationKeys. A CollationKey is created by a Collator object for a given -* String.
-*Note: CollationKeys from different Collators can not be compared. See the -* class description for CollationKey for an example using CollationKeys. *
+* @see RuleBasedCollator +* @see CollationKey * @author Syn Wee Quek * @since release 2.2, April 18 2002 * @draft 2.2 @@ -76,92 +122,92 @@ public abstract class Collator // public data members --------------------------------------------------- /** - * Collator strength value. When set, only PRIMARY differences are - * considered significant during comparison. The assignment of strengths - * to language features is locale dependant. A common example is for - * different base letters ("a" vs "b") to be considered a PRIMARY - * difference. + * Strongest collator strength value. Typically, used to denote differences + * between base characters. + * See class documentation for more explanation. * @see #setStrength * @see #getStrength * @draft 2.2 */ - public final static int PRIMARY - = RuleBasedCollator.AttributeValue.PRIMARY_; + public final static int PRIMARY = 0; /** - * Collator strength value. When set, only SECONDARY and above - * differences are considered significant during comparison. The - * assignment of strengths to language features is locale dependant. A - * common example is for different accented forms of the same base letter - * ("a" vs "\u00E4") to be considered a SECONDARY difference. + * Second level collator strength value. + * Accents in the characters are considered secondary differences. + * Other differences between letters can also be considered secondary + * differences, depending on the language. + * See class documentation for more explanation. * @see #setStrength * @see #getStrength * @draft 2.2 */ - public final static int SECONDARY - = RuleBasedCollator.AttributeValue.SECONDARY_; + public final static int SECONDARY = 1; /** - * Collator strength value. When set, only TERTIARY and above differences - * are considered significant during comparison. The assignment of - * strengths to language features is locale dependant. A common example is - * for case differences ("a" vs "A") to be considered a TERTIARY - * difference. 
+ * Third level collator strength value. + * Upper and lower case differences in characters are distinguished at this + * strength level. In addition, a variant of a letter differs from the base + * form on the tertiary level. + * See class documentation for more explanation. * @see #setStrength * @see #getStrength * @draft 2.2 */ - public final static int TERTIARY - = RuleBasedCollator.AttributeValue.TERTIARY_; - + public final static int TERTIARY = 2; /** - * Collator strength value. When set, only QUARTENARY and above differences - * are considered significant during comparison. The assignment of - * strengths to language features is locale dependant. - * difference. + * Fourth level collator strength value. + * When punctuation is ignored + * + * (see Ignoring Punctuations in the user guide) at PRIMARY to TERTIARY + * strength, an additional strength level can + * be used to distinguish words with and without punctuation + * See class documentation for more explanation. * @see #setStrength * @see #getStrength * @draft 2.2 */ - public final static int QUATERNARY - = RuleBasedCollator.AttributeValue.QUATERNARY_; - + public final static int QUATERNARY = 3; /** - *Collator strength value. When set, all differences are considered - * significant during comparison. The assignment of strengths to language - * features is locale dependant. A common example is for control - * characters ("\u0001" vs "\u0002") to be considered equal at - * the PRIMARY, SECONDARY, and TERTIARY levels but different at the - * IDENTICAL level. Additionally, differences between pre-composed - * accents such as "\u00C0" (A-grave) and combining accents such as - * "A\u0300" (A, combining-grave) will be considered significant at - * the tertiary level if decomposition is set to NO_DECOMPOSITION. + *
+ * Smallest Collator strength value. When all other strengths are equal, + * the IDENTICAL strength is used as a tiebreaker. The Unicode code point + * values of the NFD form of each string are compared, just in case there + * is no difference. + * See class documentation for more explanation. + *
+ *+ * Note this value is different from JDK's *
- *Note this value is different from JDK's
* @draft 2.2 */ - public final static int IDENTICAL - = RuleBasedCollator.AttributeValue.IDENTICAL_; + public final static int IDENTICAL = 15; /** - *Decomposition mode value. With NO_DECOMPOSITION set, accented - * characters will not be decomposed for collation. This is the default - * setting and provides the fastest collation but will only produce - * correct results for languages that do not use accents.
- *Note this value is different from JDK's
+ *+ * Decomposition mode value. With NO_DECOMPOSITION set, Strings will not be + * decomposed for collation. This is the default + * decomposition setting unless otherwise specified by the locale used + * to create the Collator. + *
+ *+ * Note this value is different from JDK's + *
+ * @see #CANONICAL_DECOMPOSITION * @see #getDecomposition * @see #setDecomposition * @draft 2.2 */ - public final static int NO_DECOMPOSITION - = RuleBasedCollator.AttributeValue.OFF_; - + public final static int NO_DECOMPOSITION = 16; /** - *Decomposition mode value. With CANONICAL_DECOMPOSITION set, + *
+ * Decomposition mode value. With CANONICAL_DECOMPOSITION set, * characters that are canonical variants according to Unicode 2.0 will be - * decomposed for collation. This should be used to get correct collation - * of accented characters.
- *CANONICAL_DECOMPOSITION corresponds to Normalization Form D as + * decomposed for collation. + *
+ *+ * CANONICAL_DECOMPOSITION corresponds to Normalization Form D as * described in - * Unicode Technical Report #15.
+ * Unicode Technical Report #15. + * + * @see #NO_DECOMPOSITION * @see #getDecomposition * @see #setDecomposition * @draft 2.2 @@ -173,9 +219,15 @@ public abstract class Collator // public setters -------------------------------------------------------- /** - *Sets this Collator's strength property. The strength property + *
+ * Sets this Collator's strength property. The strength property * determines the minimum level of difference considered significant - * during comparison.
+ * during comparison. + * + *+ * The default strength for the Collator is TERTIARY, unless specified + * otherwise by the locale used to create the Collator. + *
*See the Collator class description for an example of use.
* @param the new strength value. * @see #getStrength @@ -185,10 +237,11 @@ public abstract class Collator * @see #QUATERNARY * @see #IDENTICAL * @exception IllegalArgumentException If the new strength value is not one - * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. + * of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or IDENTICAL. * @draft 2.2 */ - public void setStrength(int newStrength) { + public void setStrength(int newStrength) + { if ((newStrength != PRIMARY) && (newStrength != SECONDARY) && (newStrength != TERTIARY) && @@ -200,18 +253,38 @@ public abstract class Collator } /** - * Set the decomposition mode of this Collator. See getDecomposition - * for a description of decomposition mode. + *+ * Set the decomposition mode of this Collator. + * Setting this decomposition property with CANONICAL_DECOMPOSITION allows + * the Collator to handle + * un-normalized text properly, producing the same results as if the text + * were normalized. If NO_DECOMPOSITION is set, it is the user's + * responsibility to insure that all text is already in the appropriate + * form before a comparison or before getting a CollationKey. Adjusting + * decomposition mode allows the user to select between faster and more + * complete collation behavior. + *
+ *+ * Since a great majority of the world languages does not require text + * normalization, most locales has NO_DECOMPOSITION has the default + * decomposition mode. + *
+ * The default decompositon mode for the Collator is NO_DECOMPOSITON, + * unless specified otherwise by the locale used to create the Collator. + *
+ *+ * See getDecomposition for a description of decomposition mode. + *
* @param decomposition the new decomposition mode * @see #getDecomposition * @see #NO_DECOMPOSITION * @see #CANONICAL_DECOMPOSITION - * @see #FULL_DECOMPOSITION - * @exception IllegalArgumentException If the given value is not a valid decomposition - * mode. + * @exception IllegalArgumentException If the given value is not a valid + * decomposition mode. * @draft 2.2 */ - public void setDecomposition(int decomposition) { + public void setDecomposition(int decomposition) + { if ((decomposition != NO_DECOMPOSITION) && (decomposition != CANONICAL_DECOMPOSITION)) { throw new IllegalArgumentException("Wrong decomposition mode."); @@ -225,9 +298,11 @@ public abstract class Collator * Gets the Collator for the current default locale. * The default locale is determined by java.util.Locale.getDefault(). * @return the Collator for the default locale (for example, en_US) if it - * is created successfully, otherwise if there is a failure, - * null will be returned. + * is created successfully. Otherwise if there is no Collator + * associated with the current locale, the default UCA collator + * will be returned. * @see java.util.Locale#getDefault + * @see #getInstance(Locale) * @draft 2.2 */ public static final Collator getInstance() @@ -238,11 +313,13 @@ public abstract class Collator /** * Gets the Collator for the desired locale. * @param locale the desired locale. - * @return Collator for the desired locale if it is created successfully, - * otherwise if there is a failure, the default UCA collator will - * be returned. + * @return Collator for the desired locale if it is created successfully. + * Otherwise if there is no Collator + * associated with the current locale, the default UCA collator + * will be returned. * @see java.util.Locale * @see java.util.ResourceBundle + * @see #getInstance() * @draft 2.2 */ public static final Collator getInstance(Locale locale) @@ -256,15 +333,19 @@ public abstract class Collator } /** - *Returns this Collator's strength property. 
The strength property - * determines the minimum level of difference considered significant - * during comparison.
- *See the Collator class description for an example of use.
+ *+ * Returns this Collator's strength property. The strength property + * determines the minimum level of difference considered significant. + *
+ *+ * See the Collator class description for more details. + *
* @return this Collator's current strength property. * @see #setStrength * @see #PRIMARY * @see #SECONDARY * @see #TERTIARY + * @see #QUATERNARY * @see #IDENTICAL * @draft 2.2 */ @@ -274,24 +355,17 @@ public abstract class Collator } /** - *Get the decomposition mode of this Collator. Decomposition mode - * determines how Unicode composed characters are handled. Adjusting - * decomposition mode allows the user to select between faster and more - * complete collation behavior. - *
The three values for decomposition mode are: - *
+ * Get the decomposition mode of this Collator. Decomposition mode + * determines how Unicode composed characters are handled. + *
+ *+ * See the Collator class description for more details. *
* @return the decomposition mode * @see #setDecomposition * @see #NO_DECOMPOSITION * @see #CANONICAL_DECOMPOSITION - * @see #FULL_DECOMPOSITION * @draft 2.2 */ public int getDecomposition() @@ -302,91 +376,68 @@ public abstract class Collator // public other methods ------------------------------------------------- /** - * Convenience method for comparing the equality of two strings based on - * this Collator's collation rules. + * Convenience method for comparing the equality of two text Strings based + * on this Collator's collation rules, strength and decomposition mode. * @param source the source string to be compared with. * @param target the target string to be compared with. * @return true if the strings are equal according to the collation * rules. false, otherwise. * @see #compare + * @exception NullPointerException thrown if either arguments is null. * @draft 2.2 */ - public boolean equals(String source, String target) + public boolean equals(String source, String target) { return (compare(source, target) == 0); } - - /** - * Cloning this Collator. - * @return a cloned Collator of this object - * @draft 2.2 - */ - public Object clone() - { - try { - return (Collator)super.clone(); - } catch (CloneNotSupportedException e) { - throw new InternalError(); - } - } /** * Compares the equality of two Collators. * @param that the Collator to be compared with this. * @return true if this Collator is the same as that Collator; - * false otherwise. + * false otherwise. * @draft 2.2 */ - public boolean equals(Object that) - { - if (this == that) { - return true; - } - if (that == null || getClass() != that.getClass()) { - return false; - } - Collator other = (Collator) that; - return ((m_strength_ == other.m_strength_) && - (m_decomposition_ == other.m_decomposition_)); - } + public abstract boolean equals(Object that); // public abstract methods ----------------------------------------------- /** - * Generates the hash code for this Collator. 
+ * Generates a unique hash code for this Collator. * @draft 2.2 + * @return 32 bit unique hash code */ public abstract int hashCode(); /** - *Compares the source string to the target string according to the - * collation rules for this Collator. Returns an integer less than, equal - * to or greater than zero depending on whether the source String is less - * than, equal to or greater than the target string. See the Collator - * class description for an example of use.
- *For a one time comparison, this method has the best performance. If - * a given String will be involved in multiple comparisons, - * CollationKey.compareTo() has the best performance. See the Collator - * class description for an example using CollationKeys.
- * @param source the source string. - * @param target the target string. + *+ * Compares the source text String to the target text String according to + * the collation rules, strength and decomposition mode for this Collator. + * Returns an integer less than, + * equal to or greater than zero depending on whether the source String is + * less than, equal to or greater than the target String. See the Collator + * class description for an example of use. + *
+ * @param source the source String. + * @param target the target String. * @return Returns an integer value. Value is less than zero if source is * less than target, value is zero if source and target are equal, * value is greater than zero if source is greater than target. * @see CollationKey * @see #getCollationKey + * @exception NullPointerException thrown if either arguments is null. * @draft 2.2 */ public abstract int compare(String source, String target); /** - *Transforms the String into a series of bits that can be compared - * bitwise to other CollationKeys. CollationKeys provide better - * performance than Collator.compare() when Strings are involved in - * multiple comparisons.
- *See the Collator class description for an example using - * CollationKeys.
- * @param source the string to be transformed into a collation key. + *+ * Transforms the String into a series of bits that can be compared + * bitwise to other CollationKeys. Bits generated depends on the collation + * rules, strength and decomposition mode. + *
+ *See the CollationKey class documentation for more information.
+ * @param source the string to be transformed into a CollationKey. * @return the CollationKey for the given String based on this Collator's * collation rules. If the source String is null, a null * CollationKey is returned. @@ -396,35 +447,18 @@ public abstract class Collator */ public abstract CollationKey getCollationKey(String source); - // protected data members ------------------------------------------------ + // protected constructor ------------------------------------------------- + + + // private data members -------------------------------------------------- /** * Collation strength */ - protected int m_strength_; + private int m_strength_ = TERTIARY; /** * Decomposition mode */ - protected int m_decomposition_; - - // protected constructor ------------------------------------------------- - - /** - *Protected constructor for use by subclasses. - * Public access to creating Collators is handled by the API getInstance(). - *
- * @draft 2.2 - */ - protected Collator() throws Exception - { - m_strength_ = TERTIARY; - m_decomposition_ = CANONICAL_DECOMPOSITION; - } - - // protected methods ----------------------------------------------------- - - // private variables ----------------------------------------------------- - - // private methods ------------------------------------------------------- + private int m_decomposition_ = CANONICAL_DECOMPOSITION; } diff --git a/icu4j/src/com/ibm/icu/text/CollatorReader.java b/icu4j/src/com/ibm/icu/text/CollatorReader.java index 7b6f4b0f2a9..bdf8f8cd325 100644 --- a/icu4j/src/com/ibm/icu/text/CollatorReader.java +++ b/icu4j/src/com/ibm/icu/text/CollatorReader.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $ -* $Date: 2002/05/16 20:04:49 $ -* $Revision: 1.2 $ +* $Date: 2002/06/21 23:56:47 $ +* $Revision: 1.3 $ * ******************************************************************************* */ @@ -140,26 +140,28 @@ final class CollatorReader * @exception IOException thrown when there's a data error. 
* @draft 2.2 */ - public void readOptions(RuleBasedCollator rbc) throws IOException + protected void readOptions(RuleBasedCollator rbc) throws IOException { rbc.m_variableTopValue_ = m_dataInputStream_.readInt(); - rbc.setAttributeDefault(RuleBasedCollator.Attribute.FRENCH_COLLATION_, - m_dataInputStream_.readInt()); - rbc.setAttributeDefault( - RuleBasedCollator.Attribute.ALTERNATE_HANDLING_, - m_dataInputStream_.readInt()); - rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_FIRST_, - m_dataInputStream_.readInt()); - rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_LEVEL_, - m_dataInputStream_.readInt()); - rbc.setAttributeDefault( - RuleBasedCollator.Attribute.NORMALIZATION_MODE_, - m_dataInputStream_.readInt()); - rbc.setAttributeDefault(RuleBasedCollator.Attribute.STRENGTH_, - m_dataInputStream_.readInt()); - rbc.setAttributeDefault( - RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_, - m_dataInputStream_.readInt()); + rbc.m_defaultIsFrenchCollation_ = (m_dataInputStream_.readInt() + == RuleBasedCollator.AttributeValue.ON_); + rbc.m_defaultIsAlternateHandlingShifted_ + = (m_dataInputStream_.readInt() == + RuleBasedCollator.AttributeValue.SHIFTED_); + rbc.m_defaultCaseFirst_ = m_dataInputStream_.readInt(); + rbc.m_defaultIsCaseLevel_ = (m_dataInputStream_.readInt() + == RuleBasedCollator.AttributeValue.ON_); + int value = m_dataInputStream_.readInt(); + if (value == RuleBasedCollator.AttributeValue.ON_) { + value = Collator.CANONICAL_DECOMPOSITION; + } + else { + value = Collator.NO_DECOMPOSITION; + } + rbc.m_defaultDecomposition_ = value; + rbc.m_defaultStrength_ = m_dataInputStream_.readInt(); + rbc.m_defaultIsHiragana4_ = (m_dataInputStream_.readInt() + == RuleBasedCollator.AttributeValue.ON_); } /** @@ -169,7 +171,7 @@ final class CollatorReader * @exception IOException thrown when there's a data error. 
* @draft 2.2 */ - public void read(RuleBasedCollator rbc) throws IOException + protected void read(RuleBasedCollator rbc) throws IOException { readHeader(rbc); readOptions(rbc); @@ -188,7 +190,8 @@ final class CollatorReader for (int i = 0; i < m_contractionCESize_; i ++) { rbc.m_contractionCE_[i] = m_dataInputStream_.readInt(); } - rbc.m_trie_ = new IntTrie(m_dataInputStream_, rbc); + rbc.m_trie_ = new IntTrie(m_dataInputStream_, + RuleBasedCollator.DataManipulate.getInstance()); if (!rbc.m_trie_.isLatin1Linear()) { throw new IOException("Data corrupted, " + "Collator Tries expected to have linear " @@ -213,6 +216,43 @@ final class CollatorReader } } + /** + * Reads in the inverse uca data + * @param input input stream with the inverse uca data + * @return an object containing the inverse uca data + * @exception IOException thrown when error occurs while reading the + * inverse uca + */ + protected static CollationParsedRuleBuilder.InverseUCA readInverseUCA( + InputStream inputStream) + throws IOException + { + ICUBinary.readHeader(inputStream, INVERSE_UCA_DATA_FORMAT_ID_, + DATA_FORMAT_VERSION_, UNICODE_VERSION_); + CollationParsedRuleBuilder.InverseUCA result = + new CollationParsedRuleBuilder.InverseUCA(); + DataInputStream input = new DataInputStream(inputStream); + int bytesize = input.readInt(); + int tablesize = input.readInt(); // in int size + int contsize = input.readInt(); // in char size + int table = input.readInt(); // in bytes + int conts = input.readInt(); // in bytes + int size = tablesize * 3; // one column for each strength + result.m_table_ = new int[size]; + result.m_continuations_ = new char[contsize]; + + for (int i = 0; i < size; i ++) { + result.m_table_[i] = input.readInt(); + } + for (int i = 0; i < contsize; i ++) { + result.m_continuations_[i] = input.readChar(); + } + input.close(); + return result; + } + + // private inner class ----------------------------------------------- + // private variables 
------------------------------------------------- /** @@ -231,6 +271,14 @@ final class CollatorReader private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x0, (byte)0x0, (byte)0x0}; /** + * Inverse UCA file format version and id that this class understands. + * No guarantees are made if a older version is used + */ + private static final byte INVERSE_UCA_DATA_FORMAT_ID_[] = {(byte)0x49, + (byte)0x6e, + (byte)0x76, + (byte)0x43}; + /** * Corrupted error string */ private static final String CORRUPTED_DATA_ERROR_ = diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java index 9284ebb030e..fe144ad8fbb 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java @@ -5,282 +5,240 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedCollator.java,v $ -* $Date: 2002/05/22 01:14:38 $ -* $Revision: 1.7 $ +* $Date: 2002/06/21 23:56:47 $ +* $Revision: 1.8 $ * ******************************************************************************* */ package com.ibm.icu.text; import java.io.InputStream; -import java.io.DataInputStream; import java.io.BufferedInputStream; -import java.io.IOException; import java.io.ByteArrayInputStream; import java.util.Locale; import java.util.ResourceBundle; -import java.util.MissingResourceException; import java.text.CharacterIterator; import java.text.StringCharacterIterator; +import java.text.ParseException; import com.ibm.icu.impl.IntTrie; import com.ibm.icu.impl.Trie; import com.ibm.icu.impl.NormalizerImpl; import com.ibm.icu.impl.ICULocaleData; /** -*The RuleBasedCollator class is a concrete subclass of Collator that -* provides a simple, data-driven, table collator. With this class you can -* create a customized table-based Collator. RuleBasedCollator maps characters -* to sort keys.
-*RuleBasedCollator has the following restrictions for efficiency (other -* subclasses may be used for more complex languages) : -*
The collation table is composed of a list of collation rules, where each -* rule is of three forms: -*
-* <modifier> -* <relation> <text-argument> -* <reset> <text-argument> -*-* -*
The definitions of the rule elements is as follows: -*
b c
is treated as bc
.
-* '@' : Indicates that accents are sorted backwards, as in French. -*
'&' : Indicates that the next rule follows the position to where -* the reset text-argument would be sorted. -*
-* This sounds more complicated than it is in practice. For example, the -* following are equivalent ways of expressing the same thing: -*
-*-* Notice that the order is important, as the subsequent item goes immediately -* after the text-argument. The following are not equivalent: -*-* a < b < c -* a < b & b < c -* a < c & a < b -*-*
-*-* Either the text-argument must already be present in the sequence, or some -* initial substring of the text-argument must be present. -* (e.g. "a < b & ae < e" is valid since "a" is present in the -* sequence before "ae" is reset). In this latter case, "ae" is not entered and -* treated as a single character; instead, "e" is sorted as if it were expanded -* to two characters: "a" followed by an "e". This difference appears in -* natural languages: in traditional Spanish "ch" is treated as though it -* contracts to a single character (expressed as "c < ch < d"), while in -* traditional German a-umlaut is treated as though it expanded to two -* characters (expressed as -* "a,A < b,B ... &ae;\u00e3&AE;\u00c3"). -* [\u00e3 and \u00c3 are, of course, the escape sequences for -* a-umlaut.] -* -*-* a < b & a < c -* a < c & a < b -*-*
-* Ignorable Characters -*
-* For ignorable characters, the first rule must start with a relation (the -* examples we have used above are really fragments; "a < b" really should -* be "< a < b"). If, however, the first relation is not "<", then all -* the all text-arguments up to the first "<" are ignorable. For example, -* ", - < a < b" makes "-" an ignorable character, as we saw earlier in -* the word "black-birds". In the samples for different languages, you see that -* most accents are ignorable.
-*Normalization and Accents -*
RuleBasedCollator
automatically processes its rule table to
-* include both pre-composed and combining-character versions of accented
-* characters. Even if the provided rule string contains only base characters
-* and separate combining accent characters, the pre-composed accented
-* characters matching all canonical combinations of characters from the rule
-* string will be entered in the table.
This allows you to use a RuleBasedCollator to compare accented strings -* even when the collator is set to NO_DECOMPOSITION. There are two caveats, -* however. First, if the strings to be collated contain combining sequences -* that may not be in canonical order, you should set the collator to -* CANONICAL_DECOMPOSITION or FULL_DECOMPOSITION to enable sorting of combining -* sequences. Second, if the strings contain characters with compatibility -* decompositions (such as full-width and half-width forms), you must use -* FULL_DECOMPOSITION, since the rule tables only include canonical mappings. -*
-*Errors
-*The following are errors:
-*If you produce one of these errors, a RuleBasedCollator
-* throws a ParseException
.
Examples
-*Simple: "< a < b < c < d"
-*Norwegian: "< a,A< b,B< c,C< d,D< e,E< f,F< " + -* "g,G< h,H< i,I< j,J< k,K< l,L< m,M< " + -* "n,N< o,O< p,P< q,Q< r,R< s,S< t,T< " + -* "u,U< v,V< w,W< x,X< y,Y< z,Z< " + -* "\u00E5=a\ u030A,\u00C5=A\u030A;aa,AA< " + -* "\u00E6,\ u00C6< \u00F8,\u00D8"
-*Normally, to create a rule-based Collator object, you will use
-* Collator
's factory method getInstance
. However, to
-* create a rule-based Collator object with specialized rules tailored to your
-* needs, you construct the RuleBasedCollator
with the rules
-* contained in a String
object. For example:
-*-* Or: -*-* String Simple = "< a< b< c< d"; -* RuleBasedCollator mySimple = new RuleBasedCollator(Simple); -*-*
-*-*-* String Norwegian = "< a,A< b,B< c,C< d,D< e,E< f,F<" + -* "g,G< h,H< i,I< j,J < k,K< l,L< " + -* "m,M< n,N< o,O< p,P< q,Q< r,R< " + -* "s,S< t,T < u,U< v,V< w,W< x,X< " + -* "y,Y< z,Z < \u00E5=a\u030A," + -* "\u00C5=A\u030A;aa,AA< \u00E6," + -* "\u00C6< \u00F8,\u00D8"; -* RuleBasedCollator myNorwegian = new RuleBasedCollator(Norwegian); -*-*
Combining Collator
s is as simple as concatenating strings.
-* Here's an example that combines two Collator
s from two
-* different locales:
-*-*-* // Create an en_US Collator object -* RuleBasedCollator en_USCollator = (RuleBasedCollator) -* Collator.getInstance(new Locale("en", "US", "")); -* // Create a da_DK Collator object -* RuleBasedCollator da_DKCollator = (RuleBasedCollator) -* Collator.getInstance(new Locale("da", "DK", "")); -* // Combine the two -* // First, get the collation rules from en_USCollator -* String en_USRules = en_USCollator.getRules(); -* // Second, get the collation rules from da_DKCollator -* String da_DKRules = da_DKCollator.getRules(); -* RuleBasedCollator newCollator = -* new RuleBasedCollator(en_USRules + da_DKRules); -* // newCollator has the combined rules -*-*
Another more interesting example would be to make changes on an existing
-* table to create a new Collator
object. For example, add
-* "&C< ch, cH, Ch, CH" to the en_USCollator
object to
-* create your own:
-*-*-* // Create a new Collator object with additional rules -* String addRules = "&C< ch, cH, Ch, CH"; -* RuleBasedCollator myCollator = -* new RuleBasedCollator(en_USCollator + addRules); -* // myCollator contains the new rules -*-*
The following example demonstrates how to change the order of -* non-spacing accents, -*
-*-*-* // old rule -* String oldRules = -* "=\u0301;\u0300;\u0302;\u0308" // main accents -* + ";\u0327;\u0303;\u0304;\u0305" // main accents -* + ";\u0306;\u0307;\u0309;\u030A" // main accents -* + ";\u030B;\u030C;\u030D;\u030E" // main accents -* + ";\u030F;\u0310;\u0311;\u0312" // main accents -* + "< a , A ; ae, AE ; \u00e6 , \u00c6" -* + "< b , B < c, C < e, E & C < d, D"; -* // change the order of accent characters -* String addOn = "& \u0300 ; \u0308 ; \u0302"; -* RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn); -*-*
The last example shows how to put new primary ordering in before the
-* default setting. For example, in Japanese Collator
, you
-* can either sort English characters before or after Japanese characters,
-*
-*-* // get en_US Collator rules -* RuleBasedCollator en_USCollator = (RuleBasedCollator) -* Collator.getInstance(Locale.US); -* // add a few Japanese character to sort before English characters -* // suppose the last character before the first base letter 'a' in -* // the English collation rule is \u2212 -* String jaString = "& \u2212 < \u3041, \u3042 < \u3043, \u3044"; -* RuleBasedCollator myJapaneseCollator = new -* RuleBasedCollator(en_USCollator.getRules() + jaString); -*-* @author Syn Wee Quek -* @since release 2.2, April 18 2002 -* @draft 2.2 -*/ -public class RuleBasedCollator extends Collator implements Trie.DataManipulate -{ + *+ * The RuleBasedCollator class is a concrete subclass of Collator. It allows + * customization of the Collator via user specified rule sets. + * RuleBasedCollator is designed to be fully compliant to the + * + * Unicode Collation Algorithm (UCA) and conforms to ISO 14651. + *
+ *+ * Users are strongly encouraged to read + * + * the users guide for more information about the collation service before + * using this class. + *
+ * <p>
+ * Create a RuleBasedCollator from a locale by calling the getInstance(Locale)
+ * factory method in the base class Collator.
+ * Collator.getInstance(Locale) creates a RuleBasedCollator object based on the
+ * collation rules defined by the argument locale.
+ * If customized collation ordering or attributes are required, use the
+ * RuleBasedCollator(String) constructor with the appropriate rules. The
+ * customized RuleBasedCollator will base its ordering on UCA, while
+ * re-adjusting the attributes and orders of the characters in the specified
+ * rule accordingly.
+ * </p>
+ * <p>
+ * RuleBasedCollator provides correct collation orders for most locales
+ * supported in ICU. If specific data for a locale is not available, the orders
+ * eventually fall back to the UCA collation order.
+ * </p>
+ *+ * For information about the collation rule syntax to use and details about + * customization, please refer to the + * + * Collation customization section of the users guide. + *
+ *+ * Note that there are some differences between the Collation rule syntax + * used in Java and ICU4J + *
+ *
+ * + *- According to the JDK documentation: + * + *
+ * Modifier '!' : Turns on Thai/Lao vowel-consonant swapping. If this rule + * is in force when a Thai vowel of the range \U0E40-\U0E44 precedes a + * Thai consonant of the range \U0E01-\U0E2E OR a Lao vowel of the + * range \U0EC0-\U0EC4 precedes a Lao consonant of the range + * \U0E81-\U0EAE then the + * vowel is placed after the consonant for collation purposes. + *
+ *+ * If a rule is without the modifier '!', the Thai/Lao vowel-consonant + * swapping is not turned on. + *
+ * + *+ * ICU4J's RuleBasedCollator does not support turning off the Thai/Lao + * vowel-consonant swapping, since the UCA clearly states that it has to be + * supported to ensure a correct sorting order. If a '!' is encountered, it is + * ignored. + *
+ *- According to the JDK documentation: + * + *
+ * If, however, the first relation is not "<", then all the all + * text-arguments up to the first "<" are ignorable. For example, + * ", - < a < b" makes "-" an ignorable character, as we saw earlier in + * the word "black-birds". + *
+ * + *+ * The above allows random characters before the first '<' not in any + * specific sequence to be ignorable. ICU4J does not support this feature. + * To define ignorable characters in PRIMARY to TERTIARY strength, users can + * use the rule "& X < [variable top]" to set the variable top to the + * PRIMARY strength of "X". Once alternate handling is set to shifted + * (setAlternateHandling(true)), the Collator using strengths PRIMARY, + * SECONDARY or TERTIARY will ignore all code points with PRIMARY strengths + * less than variable top. + * See the user guide's section on + * + * Collation Customization for details. + *
+ *- As mentioned in the documentation of the base class Collator, + * compatibility decomposition mode is not supported. + *
+ * Examples + *
+ *+ * Creating Customized RuleBasedCollators + *
+ *+ * Concatenating rules to combining+ * String Simple = "& a < b < c < d"; + * RuleBasedCollator mySimple = new RuleBasedCollator(Simple); + * + * String Norwegian = "& a , A < b , B < c , C < d , D < e , E " + * + "< f , F < g , G < h , H < i , I < j , " + * + "J < k , K < l , L < m , M < n , N < " + * + "o , O < p , P < q , Q < r , R < s , S < " + * + "t , T < u , U < v , V < w , W < x , X " + * + "< y , Y < z , Z < \u00E5 = a\u030A " + * + ", \u00C5 = A\u030A ; aa , AA < \u00E6 " + * + ", \u00C6 < \u00F8 , \u00D8"; + * RuleBasedCollator myNorwegian = new RuleBasedCollator(Norwegian); + *+ *Collator
s + *+ *+ * Making changes on an existing RuleBasedCollator to create a new + *+ * // Create an en_US Collator object + * RuleBasedCollator en_USCollator = (RuleBasedCollator) + * Collator.getInstance(new Locale("en", "US", "")); + * // Create a da_DK Collator object + * RuleBasedCollator da_DKCollator = (RuleBasedCollator) + * Collator.getInstance(new Locale("da", "DK", "")); + * // Combine the two + * // First, get the collation rules from en_USCollator + * String en_USRules = en_USCollator.getRules(); + * // Second, get the collation rules from da_DKCollator + * String da_DKRules = da_DKCollator.getRules(); + * RuleBasedCollator newCollator = + * new RuleBasedCollator(en_USRules + da_DKRules); + * // newCollator has the combined rules + *+ *Collator
object, by appending the existing rule with the + * changes. + *+ *+ * The following example demonstrates how to change the order of + * non-spacing accents, + *+ * // Create a new Collator object with additional rules + * String addRules = "& C < ch, cH, Ch, CH"; + * RuleBasedCollator myCollator = + * new RuleBasedCollator(en_USCollator + addRules); + * // myCollator contains the new rules + *+ *+ *+ * Putting new primary ordering in before the default setting, + * e.g. Sort English characters before or after Japanese characters in Japanese + *+ * // old rule with main accents + * String oldRules = "= \u0301 ; \u0300 ; \u0302 ; \u0308 " + * + "; \u0327 ; \u0303 ; \u0304 ; \u0305 " + * + "; \u0306 ; \u0307 ; \u0309 ; \u030A " + * + "; \u030B ; \u030C ; \u030D ; \u030E " + * + "; \u030F ; \u0310 ; \u0311 ; \u0312 " + * + "< a , A ; ae, AE ; \u00e6 , \u00c6 " + * + "< b , B < c, C < e, E & C < d , D"; + * // change the order of accent characters + * String addOn = "& \u0300 ; \u0308 ; \u0302"; + * RuleBasedCollator myCollator = new RuleBasedCollator(oldRules + addOn); + *+ *Collator
. + *+ *+ * + * @author Syn Wee Quek + * @since release 2.2, April 18 2002 + * @draft 2.2 + */ +public final class RuleBasedCollator extends Collator +{ // public data members --------------------------------------------------- // public constructors --------------------------------------------------- /** - *+ * // get en_US Collator rules + * RuleBasedCollator en_USCollator + * = (RuleBasedCollator)Collator.getInstance(Locale.US); + * // add a few Japanese character to sort before English characters + * // suppose the last character before the first base letter 'a' in + * // the English collation rule is \u2212 + * String jaString = "& \u2212 < \u3041, \u3042 < \u3043, " + * + "\u3044"; + * RuleBasedCollator myJapaneseCollator + * = new RuleBasedCollator(en_USCollator.getRules() + jaString); + *+ *RuleBasedCollator constructor that takes the rules. - * Please see RuleBasedCollator class description for more details on the - * collation rule syntax.
- *Note different from Java, does not throw a ParseException
- * @see java.util.Locale + *+ * RuleBasedCollator constructor that takes the argument rules for + * customization. RuleBasedCollator constructed will be based on UCA, + * with the attributes and re-ordering of the characters specified in the + * argument rules. + *
+ *See the user guide's section on + * + * Collation Customization for details on the rule syntax. + *
* @param rules the collation rules to build the collation table from. - * @exception Exception thrown when there's an error creating the collator + * @exception ParseException and IOException thrown. ParseException thrown + * when argument rules have an invalid syntax. IOException + * thrown when an error occured while reading internal data. * @draft 2.2 */ public RuleBasedCollator(String rules) throws Exception { - setStrength(Collator.TERTIARY); - setDecomposition(Collator.CANONICAL_DECOMPOSITION); + if (rules == null) { + throw new IllegalArgumentException( + "Collation rules can not be null"); + } + setWithUCAData(); + CollationParsedRuleBuilder builder + = new CollationParsedRuleBuilder(rules); + + builder.setRules(this); m_rules_ = rules; - // tables = new RBCollationTables(rules, decomp); init(); } @@ -313,36 +271,55 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate // public setters -------------------------------------------------------- /** - * Sets the Hiragana Quartenary sort to be on or off - * @param flag true if Hiragana Quartenary sort is to be on, false + * Sets the Hiragana Quaternary mode to be on or off. + * When the Hiragana Quaternary mode turned on, the RuleBasedCollator + * positions Hiragana characters before all non-ignorable characters in + * QUATERNARY strength. This is to produce a correct JIS collation order, + * distinguishing between Katakana and Hiragana characters. + * @param flag true if Hiragana Quaternary mode is to be on, false * otherwise + * @see #setHiraganaQuaternaryDefault + * @see #isHiraganaQuaternary * @draft 2.2 */ - public void setHiraganaQuartenary(boolean flag) + public void setHiraganaQuaternary(boolean flag) { m_isHiragana4_ = flag; } /** - * Sets the Hiragana Quartenary sort to be on or off depending on the - * Collator's locale specific default value. + * Sets the Hiragana Quaternary mode to the initial mode set during + * construction of the RuleBasedCollator. 
+ * See setHiraganaQuaternary(boolean) for more details. + * @see #setHiraganaQuaternary(boolean) + * @see #isHiraganaQuaternary * @draft 2.2 */ - public void setHiraganaQuartenaryDefault() + public void setHiraganaQuaternaryDefault() { m_isHiragana4_ = m_defaultIsHiragana4_; } /** - * Sets the Collator to sort with the indicated casing first - * @param upper true for sorting uppercased characters before lowercased - * characters, false for sorting lowercased characters before - * uppercased characters + * Sets the orders of upper cased characters to sort before lower cased + * characters or vice versa, in strength TERTIARY. The default + * mode is false, and that sorts lower cased characters before upper cased + * characters. + * If true is set, the RuleBasedCollator will sort upper cased characters + * before the lower cased ones. + * @param upperfirst true for sorting upper cased characters before + * lower cased characters, false for sorting lower cased + * characters before upper cased characters + * @see #setCaseFirstOff + * @see #isCaseFirstOff + * @see #isLowerCaseFirst + * @see #isUpperCaseFirst + * @see #setCaseFirstDefault * @draft 2.2 */ - public void setCaseFirst(boolean upper) + public void setCaseFirst(boolean upperfirst) { - if (upper) { + if (upperfirst) { m_caseFirst_ = AttributeValue.UPPER_FIRST_; } else { @@ -355,6 +332,11 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * Sets the Collator to ignore any previous setCaseFirst(boolean) calls. * Ignores case preferences. * @draft 2.2 + * @see #setCaseFirst(boolean) + * @see #isCaseFirstOff + * @see #isLowerCaseFirst + * @see #isUpperCaseFirst + * @see #setCaseFirstDefault */ public void setCaseFirstOff() { @@ -363,10 +345,13 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Sets the case sorting preferences to the Collator's locale specific - * default value. 
- * @see #setCaseFirst(boolean) + * Sets the case first mode to the initial mode set during + * construction of the RuleBasedCollator. + * See setCaseFirst(boolean) for more details. * @see #setCaseFirstOff + * @see #isCaseFirstOff + * @see #isUpperCaseFirst + * @see #setCaseFirst * @draft 2.2 */ public final void setCaseFirstDefault() @@ -375,10 +360,12 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate updateInternalState(); } - /** - * Sets the alternate handling value for quartenary strength to the - * Collator's locale specific default value. - * @see #setAlternateHandling + /** + * Sets the alternate handling mode to the initial mode set during + * construction of the RuleBasedCollator. + * See setAlternateHandling(boolean) for more details. + * @see #setAlternateHandling(boolean) + * @see #isAlternateHandling(boolean) * @draft 2.2 */ public void setAlternateHandlingDefault() @@ -388,8 +375,11 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Sets case level sorting to the Collator's locale specific default value. - * @see #setCaseLevel + * Sets the case level mode to the initial mode set during + * construction of the RuleBasedCollator. + * See setCaseLevel(boolean) for more details. + * @see #setCaseLevel(boolean) + * @see #isCaseLevel * @draft 2.2 */ public void setCaseLevelDefault() @@ -399,19 +389,24 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Set the decomposition mode to the Collator's locale specific default - * value. + * Sets the decomposition mode to the initial mode set during construction + * of the RuleBasedCollator. + * See setDecomposition(int) for more details. 
* @see #getDecomposition + * @see #setDecomposition(int) * @draft 2.2 */ public void setDecompositionDefault() { - m_decomposition_ = m_defaultDecomposition_; + setDecomposition(m_defaultDecomposition_); } /** - * Sets French collation to the Collator's locale specific default value. - * @see #getFrenchCollation + * Sets the French collation mode to the initial mode set during + * construction of the RuleBasedCollator. + * See setFrenchCollation(boolean) for more details. + * @see #isFrenchCollation + * @see #setFrenchCollation(boolean) * @draft 2.2 */ public void setFrenchCollationDefault() @@ -421,20 +416,31 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - *Sets strength to the Collator's locale specific default value.
- * @see #setStrength + * Sets the collation strength to the initial mode set during the + * construction of the RuleBasedCollator. + * See setStrength(int) for more details. + * @see #setStrength(int) + * @see #getStrength * @draft 2.2 */ public void setStrengthDefault() { - m_strength_ = m_defaultStrength_; - updateInternalState(); + setStrength(m_defaultStrength_); } /** - * Sets the French collation + * Sets the mode for the direction of SECONDARY weights to be used in + * French collation. + * The default value is false which treats SECONDARY weights in the order + * they appear. + * If true is set, the SECONDARY weights will be sorted backwards. + * See the section on + * + * French collation for more information. * @param flag true to set the French collation on, false to set it off * @draft 2.2 + * @see #isFrenchCollation + * @see #setFrenchCollationDefault */ public void setFrenchCollation(boolean flag) { @@ -443,11 +449,24 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Sets the alternate handling for quartenary strength to be either - * shifted or non-ignorable. This attribute will only be effective with - * a quartenary strength sort. - * @param shifted true if shifted for alternate handling is desired, false - * for the non-ignorable. + * Sets the alternate handling for Quaternary strength to be either + * shifted or non-ignorable. + * See the UCA definition on + * + * Alternate Weighting. + * This attribute will only be effective when QUATERNARY strength is set. + * The default value for this mode is false, corresponding to the + * NON_IGNORABLE mode in UCA. In the NON-IGNORABLE mode, the + * RuleBasedCollator will treats all the codepoints with non-ignorable + * primary weights in the same way. 
+ * If the mode is set to true, the behaviour corresponds to SHIFTED defined + * in UCA, this causes codepoints with PRIMARY orders that are equal or + * below the variable top value to be ignored in PRIMARY order and + * moved to the QUATERNARY order. + * @param shifted true if SHIFTED behaviour for alternate handling is + * desired, false for the NON_IGNORABLE behaviour. + * @see #isAlternateHandling(boolean) + * @see #setAlternateHandlingDefault * @draft 2.2 */ public void setAlternateHandling(boolean shifted) @@ -457,9 +476,28 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Sets if case level sorting is required. + *+ * When case level is set to true, an additional weight is formed + * between the SECONDARY and TERTIARY weight, known as the case level. + * The case level is used to distinguish large and small Japanese Kana + * characters. Case level could also be used in other situations. + * For example to distinguish certain Pinyin characters. + * The default value is false, where the case level is not generated. + * If the case level is set to true, which causes the case level to be + * generated. Contents of the case level are affected by the case first + * mode. A simple way to ignore accent differences in a string is to set + * the strength to PRIMARY and enable case level. + *
+ *+ * See the section on + * + * case level for more information. + *
* @param flag true if case level sorting is required, false otherwise * @draft 2.2 + * @see #setCaseLevelDefault + * @see #isCaseLevel + * @see #setCaseFirst(boolean) */ public void setCaseLevel(boolean flag) { @@ -468,12 +506,15 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - *Sets this Collator's strength property. The strength property + *
+ * Sets this Collator's strength property. The strength property * determines the minimum level of difference considered significant - * during comparison.
+ * during comparison. + * *See the Collator class description for an example of use.
* @param the new strength value. * @see #getStrength + * @see #setStrengthDefault * @see #PRIMARY * @see #SECONDARY * @see #TERTIARY @@ -484,58 +525,44 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * @draft 2.2 */ public void setStrength(int newStrength) { - if ((newStrength != PRIMARY) && - (newStrength != SECONDARY) && - (newStrength != TERTIARY) && - (newStrength != QUATERNARY) && - (newStrength != IDENTICAL)) { - throw new IllegalArgumentException("Incorrect comparison level."); - } - m_strength_ = newStrength; + super.setStrength(newStrength); updateInternalState(); } // public getters -------------------------------------------------------- /** - * Internal method called to parse a lead surrogate's ce for the offset - * to the next trail surrogate data. - * @param ce collation element of the lead surrogate - * @return data offset or 0 for the next trail surrogate + * Gets the collation rules for this RuleBasedCollator. + * @return returns the collation rules * @draft 2.2 */ - public int getFoldingOffset(int ce) - { - if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { - return (ce & 0xFFFFFF); - } - return 0; - } - - /** - * Gets the collation rules for this RuleBasedCollator. * @return returns the collation rules - * @draft 2.2 - */ - public final String getRules() + public String getRules() { return m_rules_; } /** - *Transforms the String into a series of bits that can be compared - * bitwise to other CollationKeys. CollationKeys provide better - * performance than Collator.compare() when Strings are involved in - * multiple comparisons.
- *Internally CollationKey stores its data in a null-terminated byte - * array.
- *See the Collator class description for an example using - * CollationKeys.
- * @param source the string to be transformed into a collation key. - * @return the CollationKey for the given String based on this Collator's - * collation rules. If the source String is null, a null - * CollationKey is returned. + *+ * Get a Collation key for the argument String source from this + * RuleBasedCollator. + *
+ *+ * General recommendation:
+ *
+ * If comparison are to be done to the same String multiple times, it would + * be more efficient to generate CollationKeys for the Strings and use + * CollationKey.compareTo(CollationKey) for the comparisons. + * If the each Strings are compared to only once, using the method + * RuleBasedCollator.compare(String, String) will have a better performance. + *+ * See the class documentation for an explanation about CollationKeys. + *
+ * @param source the text String to be transformed into a collation key. + * @return the CollationKey for the given String based on this + * RuleBasedCollator's collation rules. If the source String is + * null, a null CollationKey is returned. * @see CollationKey - * @see compare(String, String) + * @see #compare(String, String) * @draft 2.2 */ public CollationKey getCollationKey(String source) @@ -543,19 +570,20 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate if (source == null) { return null; } + int strength = getStrength(); boolean compare[] = {m_isCaseLevel_, true, - m_strength_ >= SECONDARY, - m_strength_ >= TERTIARY, - m_strength_ >= QUATERNARY, - m_strength_ == IDENTICAL + strength >= SECONDARY, + strength >= TERTIARY, + strength >= QUATERNARY, + strength == IDENTICAL }; byte bytes[][] = {new byte[SORT_BUFFER_INIT_SIZE_CASE_], // case new byte[SORT_BUFFER_INIT_SIZE_1_], // primary new byte[SORT_BUFFER_INIT_SIZE_2_], // secondary new byte[SORT_BUFFER_INIT_SIZE_3_], // tertiary - new byte[SORT_BUFFER_INIT_SIZE_4_] // quartenary + new byte[SORT_BUFFER_INIT_SIZE_4_] // Quaternary }; int bytescount[] = {0, 0, 0, 0, 0}; int count[] = {0, 0, 0, 0, 0}; @@ -573,31 +601,27 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate int bottomCount4 = 0xFF - commonBottom4; // If we need to normalize, we'll do it all at once at the beginning! 
- if ((compare[5] || m_decomposition_ != NO_DECOMPOSITION) - /*&& UNORM_YES != unorm_quickCheck(source, len, normMode, status)*/ - ) { - /* - * len = unorm_internalNormalize(normSource, normSourceLen, - source, len, - normMode, FALSE, - status); - source = normSource;*/ - String norm = source; - getSortKeyBytes(norm, compare, bytes, bytescount, count, - doFrench, hiragana4, commonBottom4, bottomCount4); + if ((compare[5] || getDecomposition() != NO_DECOMPOSITION) + && Normalizer.quickCheck(source, Normalizer.NFD) + != Normalizer.YES) { + source = Normalizer.decompose(source, false); } - else { - getSortKeyBytes(source, compare, bytes, bytescount, count, doFrench, + getSortKeyBytes(source, compare, bytes, bytescount, count, doFrench, hiragana4, commonBottom4, bottomCount4); - } byte sortkey[] = getSortKey(source, compare, bytes, bytescount, count, doFrench, commonBottom4, bottomCount4); return new CollationKey(source, sortkey); } /** - * Checks if uppercase is sorted before lowercase - * @return true if Collator sorts uppercase before lower, false otherwise + * Checks if upper cased character is sorted before lower cased character. + * See setCaseFirst(boolean) for details. + * @see #setCaseFirstOff + * @see #setCaseFirst(boolean) + * @see #isLowerCaseFirst + * @see #setCaseFirstDefault + * @return true if upper cased characters are sorted before lower cased + * characters, false otherwise * @draft 2.2 */ public boolean isUpperCaseFirst() @@ -606,8 +630,14 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Checks if lowercase is sorted before uppercase - * @return true if Collator sorts lowercase before upper, false otherwise + * Checks if lower cased character is sorted before upper cased character. + * See setCaseFirst(boolean) for details. 
+ * @see #setCaseFirstOff + * @see #setCaseFirst(boolean) + * @see #isUpperCaseFirst + * @see #setCaseFirstDefault + * @return true lower cased characters are sorted before upper cased + * characters, false otherwise * @draft 2.2 */ public boolean isLowerCaseFirst() @@ -616,8 +646,16 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Checks if case sorting is off. - * @return true if case sorting is off, false otherwise + * Checks if a previous call to setCaseFirst(boolean) is turned off + * by setCaseFirstOff(). + * See setCaseFirst(boolean) for details. + * @return true if the customized case sorting is turned off, false + * otherwise + * @see #setCaseFirstOff + * @see #setCaseFirst(boolean) + * @see #isUpperCaseFirst + * @see #isLowerCaseFirst + * @see #setCaseFirstDefault * @draft 2.2 */ public boolean isCaseFirstOff() @@ -626,28 +664,33 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Checks if the alternate handling attribute is shifted or non-ignorable. + * Checks if the alternate handling behaviour is the UCA defined SHIFTED or + * NON_IGNORABLE. **
- * @param shifted true if checks are to be done on shifted, false if - * checks are to be done on non-ignorable + * See setAlternateHandling(boolean) for more details. + * @param shifted true if checks are to be done to see if the SHIFTED + * behaviour is on, false if checks are to be done to see if the + * NON_IGNORABLE behaviour is on. * @return true or false - * @draft 2.2 - */ + * @see #setAlternateHandling(boolean) + * @see #setAlternateHandlingDefault + * @draft 2.2 + */ public boolean isAlternateHandling(boolean shifted) { if (shifted) { @@ -657,8 +700,12 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Checks if case level sorting is on - * @return true if case level sorting is on + * Checks if case level is set to true. + * See setCaseLevel(boolean) for details. + * @return the case level mode + * @see #setCaseLevelDefault + * @see #isCaseLevel + * @see #setCaseLevel(boolean) * @draft 2.2 */ public boolean isCaseLevel() @@ -667,72 +714,103 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Checks if French Collation sorting is on - * @return true if French Collation sorting is on + * Checks if French Collation is set to true. + * See setFrenchCollation(boolean) for details. + * @return true if French Collation is set to true, false otherwise + * @see #setFrenchCollation(boolean) + * @see #setFrenchCollationDefault * @draft 2.2 */ public boolean isFrenchCollation() { return m_isFrenchCollation_; } + + /** + * Checks if the Hiragana Quaternary mode is set on. + * See setHiraganaQuaternary(boolean) for more details. 
+ * @return flag true if Hiragana Quaternary mode is on, false otherwise + * @see #setHiraganaQuaternaryDefault + * @see #setHiraganaQuaternary(boolean) + * @draft 2.2 + */ + public boolean isHiraganaQuaternary() + { + return m_isHiragana4_; + } // public other methods ------------------------------------------------- /** - * Compares the equality of two RuleBasedCollators. + * Compares the equality of two RuleBasedCollator objects. + * RuleBasedCollator objects are equivalent if they have the same collation + * rules and the same attributes. * @param obj the RuleBasedCollator to be compared with. - * @return true if this RuleBasedCollator has exactly the same behaviour - * as obj, false otherwise. + * @return true if this RuleBasedCollator has exactly the same + * collation behaviour as obj, false otherwise. * @draft 2.2 */ public boolean equals(Object obj) { - if (obj == null || !super.equals(obj)) { + if (obj == null) { return false; // super does class check } + if (this == obj) { + return true; + } + if (getClass() != obj.getClass()) { + return false; + } RuleBasedCollator other = (RuleBasedCollator)obj; // all other non-transient information is also contained in rules. - return (m_rules_.equals(other.m_rules_)); - } - - /** - * Standard override; no change in semantics. - * @draft 2.2 - */ - public Object clone() { - // synwee todo: do after all implementation done - return null; + return getStrength() == other.getStrength() + && getDecomposition() == other.getDecomposition() + && other.m_caseFirst_ == m_caseFirst_ + && other.m_caseSwitch_ == m_caseSwitch_ + && other.m_isAlternateHandlingShifted_ + == m_isAlternateHandlingShifted_ + && other.m_isCaseLevel_ == m_isCaseLevel_ + && other.m_isFrenchCollation_ == m_isFrenchCollation_ + && other.m_isHiragana4_ == m_isHiragana4_ + && m_rules_.equals(other.m_rules_); } /** - * Generates the hash code for this RuleBasedCollator. + * Generates a unique hash code for this RuleBasedCollator. 
* @return the unique hash code for this Collator * @draft 2.2 */ - public final int hashCode() + public int hashCode() { return getRules().hashCode(); } /** - *- If argument shifted is true and *
*
*- return value is true, then the alternate handling attribute for - * the Collator is shifted. Or + * the Collator is SHIFTED. Or *
- return value is false, then the alternate handling attribute for - * the Collator is not shifted + * the Collator is NON_IGNORABLE *
- If argument shifted is false and *
*
*- return value is true, then the alternate handling attribute for - * the Collator is non-ignorable. Or + * the Collator is NON_IGNORABLE. Or *
- return value is false, then the alternate handling attribute for - * the Collator is not non-ignorable. + * the Collator is SHIFTED. *
Compares the source string to the target string according to the - * collation rules for this Collator. Returns an integer less than, equal - * to or greater than zero depending on whether the source String is less - * than, equal to or greater than the target string. See the Collator - * class description for an example of use.
- *For a one time comparison, this method has the best performance. If - * a given String will be involved in multiple comparisons, - * CollationKey.compareTo() has the best performance. See the Collator - * class description for an example using CollationKeys.
- * @param source the source string. - * @param target the target string. + * Compares the source text String to the target text String according to + * the collation rules, strength and decomposition mode for this + * RuleBasedCollator. + * Returns an integer less than, + * equal to or greater than zero depending on whether the source String is + * less than, equal to or greater than the target String. See the Collator + * class description for an example of use. + * + *+ * General recommendation:
+ * @param source the source text String. + * @param target the target text String. * @return Returns an integer value. Value is less than zero if source is * less than target, value is zero if source and target are equal, * value is greater than zero if source is greater than target. * @see CollationKey - * @see Collator#getCollationKey + * @see #getCollationKey * @draft 2.2 */ - public final int compare(String source, String target) + public int compare(String source, String target) { if (source == target) { return 0; @@ -753,13 +831,14 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate return 1; } + int strength = getStrength(); // setting up the collator parameters boolean compare[] = {m_isCaseLevel_, true, - m_strength_ >= SECONDARY, - m_strength_ >= TERTIARY, - m_strength_ >= QUATERNARY, - m_strength_ == IDENTICAL + strength >= SECONDARY, + strength >= TERTIARY, + strength >= QUATERNARY, + strength == IDENTICAL }; boolean doFrench = m_isFrenchCollation_ && compare[2]; boolean doShift4 = m_isAlternateHandlingShifted_ && compare[4]; @@ -830,15 +909,13 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } return 0; } - - // public abstract methods ----------------------------------------------- - // protected inner interfaces -------------------------------------------- + // package private inner interfaces -------------------------------------- /** * Attribute values to be used when setting the Collator options - */ - protected static interface AttributeValue + */ + static interface AttributeValue { /** * Indicates that the default attribute value will be used. 
@@ -848,158 +925,250 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate /** * Primary collation strength */ - static final int PRIMARY_ = 0; - /** - * Secondary collation strength - */ - static final int SECONDARY_ = 1; - /** - * Tertiary collation strength - */ - static final int TERTIARY_ = 2; - /** - * Default collation strength - */ - static final int DEFAULT_STRENGTH_ = TERTIARY; - /** - * Internal use for strength checks in Collation elements - */ - static final int CE_STRENGTH_LIMIT_ = TERTIARY + 1; - /** - * Quaternary collation strength - */ - static final int QUATERNARY_ = 3; - /** - * Identical collation strength - */ - static final int IDENTICAL_ = 15; - /** - * Internal use for strength checks - */ - static final int STRENGTH_LIMIT_ = IDENTICAL + 1; - /** - * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, - * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE - */ - static final int OFF_ = 16; - /** - * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, - * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE - */ - static final int ON_ = 17; + static final int PRIMARY_ = Collator.PRIMARY; + /** + * Secondary collation strength + */ + static final int SECONDARY_ = Collator.SECONDARY; + /** + * Tertiary collation strength + */ + static final int TERTIARY_ = Collator.TERTIARY; + /** + * Default collation strength + */ + static final int DEFAULT_STRENGTH_ = Collator.TERTIARY; + /** + * Internal use for strength checks in Collation elements + */ + static final int CE_STRENGTH_LIMIT_ = Collator.TERTIARY + 1; + /** + * Quaternary collation strength + */ + static final int QUATERNARY_ = 3; + /** + * Identical collation strength + */ + static final int IDENTICAL_ = Collator.IDENTICAL; + /** + * Internal use for strength checks + */ + static final int STRENGTH_LIMIT_ = Collator.IDENTICAL + 1; + /** + * Turn the feature off - works for FRENCH_COLLATION, CASE_LEVEL, + * HIRAGANA_QUATERNARY_MODE and 
DECOMPOSITION_MODE + */ + static final int OFF_ = 16; + /** + * Turn the feature on - works for FRENCH_COLLATION, CASE_LEVEL, + * HIRAGANA_QUATERNARY_MODE and DECOMPOSITION_MODE + */ + static final int ON_ = 17; /** * Valid for ALTERNATE_HANDLING. Alternate handling will be shifted */ - static final int SHIFTED_ = 20; - /** - * Valid for ALTERNATE_HANDLING. Alternate handling will be non - * ignorable - */ - static final int NON_IGNORABLE_ = 21; - /** - * Valid for CASE_FIRST - lower case sorts before upper case - */ - static final int LOWER_FIRST_ = 24; - /** - * Upper case sorts before lower case - */ - static final int UPPER_FIRST_ = 25; - /** - * Valid for NORMALIZATION_MODE ON and OFF are also allowed for this - * attribute - */ - static final int ON_WITHOUT_HANGUL_ = 28; - /** - * Number of attribute values - */ - static final int LIMIT_ = 29; - } - - /** - * Attributes that collation service understands. All the attributes can - * take DEFAULT value, as well as the values specific to each one. - */ - protected static interface Attribute { - /** - * Attribute for direction of secondary weights - used in French. - * Acceptable values are ON, which results in secondary weights being - * considered backwards and OFF which treats secondary weights in the - * order they appear. - */ - static final int FRENCH_COLLATION_ = 0; - /** - * Attribute for handling variable elements. Acceptable values are - * NON_IGNORABLE (default) which treats all the codepoints with - * non-ignorable primary weights in the same way, and SHIFTED which - * causes codepoints with primary weights that are equal or below the - * variable top value to be ignored on primary level and moved to the - * quaternary level. - */ - static final int ALTERNATE_HANDLING_ = 1; - /** - * Controls the ordering of upper and lower case letters. 
Acceptable - * values are OFF (default), which orders upper and lower case letters - * in accordance to their tertiary weights, UPPER_FIRST which forces - * upper case letters to sort before lower case letters, and - * LOWER_FIRST which does the opposite. - */ - static final int CASE_FIRST_ = 2; - /** - * Controls whether an extra case level (positioned before the third - * level) is generated or not. Acceptable values are OFF (default), - * when case level is not generated, and ON which causes the case - * level to be generated. Contents of the case level are affected by - * the value of CASE_FIRST attribute. A simple way to ignore accent - * differences in a string is to set the strength to PRIMARY and - * enable case level. - */ - static final int CASE_LEVEL_ = 3; - /** - * Controls whether the normalization check and necessary - * normalizations are performed. When set to OFF (default) no - * normalization check is performed. The correctness of the result is - * guaranteed only if the input data is in so-called FCD form (see - * users manual for more info). When set to ON, an incremental check - * is performed to see whether the input data is in the FCD form. If - * the data is not in the FCD form, incremental NFD normalization is - * performed. - */ - static final int NORMALIZATION_MODE_ = 4; - /** - * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, - * QUATERNARY or IDENTICAL. The usual strength for most locales - * (except Japanese) is tertiary. Quaternary strength is useful when - * combined with shifted setting for alternate handling attribute and - * for JIS x 4061 collation, when it is used to distinguish between - * Katakana and Hiragana (this is achieved by setting the - * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is - * affected only by the number of non ignorable code points in the - * string. Identical strength is rarely useful, as it amounts to - * codepoints of the NFD form of the string. 
- */ - static final int STRENGTH_ = 5; - /** - * When turned on, this attribute positions Hiragana before all - * non-ignorables on quaternary level. This is a sneaky way to produce - * JIS sort order. - */ - static final int HIRAGANA_QUATERNARY_MODE_ = 6; - /** - * Attribute count - */ - static final int LIMIT_ = 7; - } + static final int SHIFTED_ = 20; + /** + * Valid for ALTERNATE_HANDLING. Alternate handling will be non + * ignorable + */ + static final int NON_IGNORABLE_ = 21; + /** + * Valid for CASE_FIRST - lower case sorts before upper case + */ + static final int LOWER_FIRST_ = 24; + /** + * Upper case sorts before lower case + */ + static final int UPPER_FIRST_ = 25; + /** + * Valid for NORMALIZATION_MODE ON and OFF are also allowed for this + * attribute + */ + static final int ON_WITHOUT_HANGUL_ = 28; + /** + * Number of attribute values + */ + static final int LIMIT_ = 29; + }; + + /** + * Attributes that collation service understands. All the attributes can + * take DEFAULT value, as well as the values specific to each one. + */ + static interface Attribute + { + /** + * Attribute for direction of secondary weights - used in French. + * Acceptable values are ON, which results in secondary weights being + * considered backwards and OFF which treats secondary weights in the + * order they appear. + */ + static final int FRENCH_COLLATION_ = 0; + /** + * Attribute for handling variable elements. Acceptable values are + * NON_IGNORABLE (default) which treats all the codepoints with + * non-ignorable primary weights in the same way, and SHIFTED which + * causes codepoints with primary weights that are equal or below the + * variable top value to be ignored on primary level and moved to the + * quaternary level. + */ + static final int ALTERNATE_HANDLING_ = 1; + /** + * Controls the ordering of upper and lower case letters. 
Acceptable + * values are OFF (default), which orders upper and lower case letters + * in accordance to their tertiary weights, UPPER_FIRST which forces + * upper case letters to sort before lower case letters, and + * LOWER_FIRST which does the opposite. + */ + static final int CASE_FIRST_ = 2; + /** + * Controls whether an extra case level (positioned before the third + * level) is generated or not. Acceptable values are OFF (default), + * when case level is not generated, and ON which causes the case + * level to be generated. Contents of the case level are affected by + * the value of CASE_FIRST attribute. A simple way to ignore accent + * differences in a string is to set the strength to PRIMARY and + * enable case level. + */ + static final int CASE_LEVEL_ = 3; + /** + * Controls whether the normalization check and necessary + * normalizations are performed. When set to OFF (default) no + * normalization check is performed. The correctness of the result is + * guaranteed only if the input data is in so-called FCD form (see + * users manual for more info). When set to ON, an incremental check + * is performed to see whether the input data is in the FCD form. If + * the data is not in the FCD form, incremental NFD normalization is + * performed. + */ + static final int NORMALIZATION_MODE_ = 4; + /** + * The strength attribute. Can be either PRIMARY, SECONDARY, TERTIARY, + * QUATERNARY or IDENTICAL. The usual strength for most locales + * (except Japanese) is tertiary. Quaternary strength is useful when + * combined with shifted setting for alternate handling attribute and + * for JIS x 4061 collation, when it is used to distinguish between + * Katakana and Hiragana (this is achieved by setting the + * HIRAGANA_QUATERNARY mode to on. Otherwise, quaternary level is + * affected only by the number of non ignorable code points in the + * string. Identical strength is rarely useful, as it amounts to + * codepoints of the NFD form of the string. 
+ */ + static final int STRENGTH_ = 5; + /** + * When turned on, this attribute positions Hiragana before all + * non-ignorables on quaternary level. This is a sneaky way to produce + * JIS sort order. + */ + static final int HIRAGANA_QUATERNARY_MODE_ = 6; + /** + * Attribute count + */ + static final int LIMIT_ = 7; + }; - // protected data members ------------------------------------------------ - /** + * DataManipulate singleton + */ + static class DataManipulate implements Trie.DataManipulate + { + // public methods ---------------------------------------------------- + + /** + * Internal method called to parse a lead surrogate's ce for the offset + * to the next trail surrogate data. + * @param ce collation element of the lead surrogate + * @return data offset or 0 for the next trail surrogate + * @draft 2.2 + */ + public final int getFoldingOffset(int ce) + { + if (isSpecial(ce) && getTag(ce) == CE_SURROGATE_TAG_) { + return (ce & 0xFFFFFF); + } + return 0; + } + + /** + * Get singleton object + */ + public static final DataManipulate getInstance() + { + if (m_instance_ == null) { + m_instance_ = new DataManipulate(); + } + return m_instance_; + } + + // private data member ---------------------------------------------- + + /** + * Singleton instance + */ + private static DataManipulate m_instance_; + + // private constructor ---------------------------------------------- + + /** + * private to prevent initialization + */ + private DataManipulate() + { + } + }; + + // package private data member ------------------------------------------- + + static final byte BYTE_FIRST_TAILORED_ = (byte)0x04; + static final byte BYTE_COMMON_ = (byte)0x05; + static final int COMMON_TOP_2_ = 0x86; // int for unsigness + static final int COMMON_BOTTOM_2_ = BYTE_COMMON_; + /** + * Case strength mask + */ + static final int CE_CASE_BIT_MASK_ = 0xC0; + static final int CE_TAG_SHIFT_ = 24; + static final int CE_TAG_MASK_ = 0x0F000000; + + static final int CE_SPECIAL_FLAG_ = 
0xF0000000; + /** + * Lead surrogate that is tailored and doesn't start a contraction + */ + static final int CE_SURROGATE_TAG_ = 5; + /** + * Mask to get the primary strength of the collation element + */ + static final int CE_PRIMARY_MASK_ = 0xFFFF0000; + /** + * Mask to get the secondary strength of the collation element + */ + static final int CE_SECONDARY_MASK_ = 0xFF00; + /** + * Mask to get the tertiary strength of the collation element + */ + static final int CE_TERTIARY_MASK_ = 0xFF; + /** + * Primary strength shift + */ + static final int CE_PRIMARY_SHIFT_ = 16; + /** + * Secondary strength shift + */ + static final int CE_SECONDARY_SHIFT_ = 8; + /** + * Continuation marker + */ + static final int CE_CONTINUATION_MARKER_ = 0xC0; + + /** * Size of collator raw data headers and options before the expansion * data. This is used when expansion ces are to be retrieved. ICU4C uses * the expansion offset starting from UCollator.UColHeader, hence ICU4J * will have to minus that off to get the right expansion ce offset. In * number of ints. */ - protected int m_expansionOffset_; + int m_expansionOffset_; /** * Size of collator raw data headers, options and expansions before * contraction data. This is used when contraction ces are to be retrieved. @@ -1007,63 +1176,63 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * ICU4J will have to minus that off to get the right contraction ce * offset. In number of chars. 
*/ - protected int m_contractionOffset_; + int m_contractionOffset_; /** * Flag indicator if Jamo is special */ - protected boolean m_isJamoSpecial_; + boolean m_isJamoSpecial_; // Collator options ------------------------------------------------------ - protected int m_defaultVariableTopValue_; - protected boolean m_defaultIsFrenchCollation_; - protected boolean m_defaultIsAlternateHandlingShifted_; - protected int m_defaultCaseFirst_; - protected boolean m_defaultIsCaseLevel_; - protected int m_defaultDecomposition_; - protected int m_defaultStrength_; - protected boolean m_defaultIsHiragana4_; + int m_defaultVariableTopValue_; + boolean m_defaultIsFrenchCollation_; + boolean m_defaultIsAlternateHandlingShifted_; + int m_defaultCaseFirst_; + boolean m_defaultIsCaseLevel_; + int m_defaultDecomposition_; + int m_defaultStrength_; + boolean m_defaultIsHiragana4_; /** * Value of the variable top */ - protected int m_variableTopValue_; + int m_variableTopValue_; /** * Attribute for special Hiragana */ - protected boolean m_isHiragana4_; + boolean m_isHiragana4_; /** * Case sorting customization */ - protected int m_caseFirst_; + int m_caseFirst_; // end Collator options -------------------------------------------------- - + /** * Expansion table */ - protected int m_expansion_[]; + int m_expansion_[]; /** * Contraction index table */ - protected char m_contractionIndex_[]; + char m_contractionIndex_[]; /** * Contraction CE table */ - protected int m_contractionCE_[]; + int m_contractionCE_[]; /** * Data trie */ - protected IntTrie m_trie_; + IntTrie m_trie_; /** * Table to store all collation elements that are the last element of an * expansion. This is for use in StringSearch. */ - protected int m_expansionEndCE_[]; + int m_expansionEndCE_[]; /** * Table to store the maximum size of any expansions that end with the * corresponding collation element in m_expansionEndCE_. 
For use in * StringSearch too */ - protected byte m_expansionEndCEMaxSize_[]; + byte m_expansionEndCEMaxSize_[]; /** * Heuristic table to store information on whether a char character is * considered "unsafe". "Unsafe" character are combining marks or those @@ -1072,33 +1241,33 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * unsafe. If we have another contraction "ZA" with the one above, then * 'A', 'B', 'C' are "unsafe" but 'Z' is not. */ - protected byte m_unsafe_[]; + byte m_unsafe_[]; /** * Table to store information on whether a codepoint can occur as the last * character in a contraction */ - protected byte m_contractionEnd_[]; + byte m_contractionEnd_[]; /** - * Table for UCA use, may be removed + * Table for UCA and builder use */ - protected char m_UCAContraction_[]; + char m_UCAContraction_[]; /** * Original collation rules */ - protected String m_rules_; + String m_rules_; /** * The smallest "unsafe" codepoint */ - protected char m_minUnsafe_; + char m_minUnsafe_; /** * The smallest codepoint that could be the end of a contraction */ - protected char m_minContractionEnd_; + char m_minContractionEnd_; /** * UnicodeData.txt property object */ - protected static final RuleBasedCollator UCA_; + static final RuleBasedCollator UCA_; // block to initialise character property database static @@ -1124,99 +1293,12 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate e.printStackTrace(); throw new RuntimeException(e.getMessage()); } - } + } - // protected constants --------------------------------------------------- + // package private constructors ------------------------------------------ - protected static final int CE_SPECIAL_FLAG_ = 0xF0000000; - /** - * Lead surrogate that is tailored and doesn't start a contraction - */ - protected static final int CE_SURROGATE_TAG_ = 5; - - /** - * Minimum size required for the binary collation data in bytes. 
- * Size of UCA header + size of options to 4 bytes - */ - private static final int MIN_BINARY_DATA_SIZE_ = (41 + 8) << 2; - /** - * Mask to get the primary strength of the collation element - */ - protected static final int CE_PRIMARY_MASK_ = 0xFFFF0000; - /** - * Mask to get the secondary strength of the collation element - */ - protected static final int CE_SECONDARY_MASK_ = 0xFF00; - /** - * Mask to get the tertiary strength of the collation element - */ - protected static final int CE_TERTIARY_MASK_ = 0xFF; - /** - * Primary strength shift - */ - protected static final int CE_PRIMARY_SHIFT_ = 16; - /** - * Secondary strength shift - */ - protected static final int CE_SECONDARY_SHIFT_ = 8; - - /** - * Continuation marker - */ - protected static final int CE_CONTINUATION_MARKER_ = 0xC0; - - // end protected constants ----------------------------------------------- - - // protected constructor ------------------------------------------------- - - /** - * Constructors a RuleBasedCollator from the argument locale. - * If no resource bundle is associated with the locale, UCA is used - * instead. 
- * @param locale - * @exception Exception thrown when there's an error creating the Collator - */ - protected RuleBasedCollator(Locale locale) throws Exception - { - ResourceBundle rb = ICULocaleData.getLocaleElements(locale); - - if (rb != null) { - byte map[] = (byte [])rb.getObject("%%CollationBin"); - // synwee todo: problem, data in little endian and - // ICUListResourceBundle should not calculate size by - // using .available() that only gives the buffer size - BufferedInputStream input = - new BufferedInputStream(new ByteArrayInputStream(map)); - CollatorReader reader = new CollatorReader(input, false); - if (map.length > MIN_BINARY_DATA_SIZE_) { - // synwee todo: undo when problem solved - reader.read(this); - } - else { - reader.readHeader(this); - reader.readOptions(this); - // duplicating UCA_'s data - m_expansion_ = UCA_.m_expansion_; - m_contractionIndex_ = UCA_.m_contractionIndex_; - m_contractionCE_ = UCA_.m_contractionCE_; - m_trie_ = UCA_.m_trie_; - m_expansionEndCE_ = UCA_.m_expansionEndCE_; - m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_; - m_unsafe_ = UCA_.m_unsafe_; - m_contractionEnd_ = UCA_.m_contractionEnd_; - m_minUnsafe_ = UCA_.m_minUnsafe_; - m_minContractionEnd_ = UCA_.m_minContractionEnd_; - } - Object rules = rb.getObject("CollationElements"); - if (rules != null) { - m_rules_ = (String)((Object[][])rules)[0][1]; - } - init(); - } - } - - /** - *
+ * If comparison are to be done to the same String multiple times, it would + * be more efficient to generate CollationKeys for the Strings and use + * CollationKey.compareTo(CollationKey) for the comparisons. + * If the each Strings are compared to only once, using the method + * RuleBasedCollator.compare(String, String) will have a better performance. + *Protected constructor for use by subclasses. + /** + *
Private contructor for use by subclasses. * Public access to creating Collators is handled by the API * Collator.getInstance() or RuleBasedCollator(String rules). *
@@ -1225,41 +1307,68 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * * @draft 2.2 */ - protected RuleBasedCollator() throws Exception + RuleBasedCollator() { } - - // protected methods ----------------------------------------------------- + + // package private methods ----------------------------------------------- /** - * Initializes the RuleBasedCollator + * Sets this collator to use the tables in UCA. Note options not taken + * care of here. */ - protected final void init() + final void setWithUCATables() { - for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; - m_minUnsafe_ ++) { - // Find the smallest unsafe char. - if (isUnsafe(m_minUnsafe_)) { - break; - } - } - - for (m_minContractionEnd_ = 0; - m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; - m_minContractionEnd_ ++) { - // Find the smallest contraction-ending char. - if (isContractionEnd(m_minContractionEnd_)) { - break; - } - } - m_strength_ = m_defaultStrength_; - m_decomposition_ = m_defaultDecomposition_; - m_isFrenchCollation_ = m_defaultIsFrenchCollation_; - m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; - m_isCaseLevel_ = m_defaultIsCaseLevel_; - m_caseFirst_ = m_defaultCaseFirst_; - m_isHiragana4_ = m_defaultIsHiragana4_; - updateInternalState(); + m_expansion_ = UCA_.m_expansion_; + m_contractionIndex_ = UCA_.m_contractionIndex_; + m_contractionCE_ = UCA_.m_contractionCE_; + m_trie_ = UCA_.m_trie_; + m_expansionEndCE_ = UCA_.m_expansionEndCE_; + m_expansionEndCEMaxSize_ = UCA_.m_expansionEndCEMaxSize_; + m_unsafe_ = UCA_.m_unsafe_; + m_contractionEnd_ = UCA_.m_contractionEnd_; + m_minUnsafe_ = UCA_.m_minUnsafe_; + m_minContractionEnd_ = UCA_.m_minContractionEnd_; + } + + /** + * Sets this collator to use the all options and tables in UCA. 
+ */ + final void setWithUCAData() + { + m_addition3_ = UCA_.m_addition3_; + m_bottom3_ = UCA_.m_bottom3_; + m_bottomCount3_ = UCA_.m_bottomCount3_; + m_caseFirst_ = UCA_.m_caseFirst_; + m_caseSwitch_ = UCA_.m_caseSwitch_; + m_common3_ = UCA_.m_common3_; + m_contractionOffset_ = UCA_.m_contractionOffset_; + setDecomposition(UCA_.getDecomposition()); + m_defaultCaseFirst_ = UCA_.m_defaultCaseFirst_; + m_defaultDecomposition_ = UCA_.m_defaultDecomposition_; + m_defaultIsAlternateHandlingShifted_ + = UCA_.m_defaultIsAlternateHandlingShifted_; + m_defaultIsCaseLevel_ = UCA_.m_defaultIsCaseLevel_; + m_defaultIsFrenchCollation_ = UCA_.m_defaultIsFrenchCollation_; + m_defaultIsHiragana4_ = UCA_.m_defaultIsHiragana4_; + m_defaultStrength_ = UCA_.m_defaultStrength_; + m_defaultVariableTopValue_ = UCA_.m_defaultVariableTopValue_; + m_expansionOffset_ = UCA_.m_expansionOffset_; + m_isAlternateHandlingShifted_ = UCA_.m_isAlternateHandlingShifted_; + m_isCaseLevel_ = UCA_.m_isCaseLevel_; + m_isFrenchCollation_ = UCA_.m_isFrenchCollation_; + m_isHiragana4_ = UCA_.m_isHiragana4_; + m_isJamoSpecial_ = UCA_.m_isJamoSpecial_; + m_isSimple3_ = UCA_.m_isSimple3_; + m_mask3_ = UCA_.m_mask3_; + m_minContractionEnd_ = UCA_.m_minContractionEnd_; + m_minUnsafe_ = UCA_.m_minUnsafe_; + m_rules_ = UCA_.m_rules_; + setStrength(UCA_.getStrength()); + m_top3_ = UCA_.m_top3_; + m_topCount3_ = UCA_.m_topCount3_; + m_variableTopValue_ = UCA_.m_variableTopValue_; + setWithUCATables(); } /** @@ -1272,7 +1381,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * @param ch character to determin * @return true if ch is unsafe, false otherwise */ - protected final boolean isUnsafe(char ch) + final boolean isUnsafe(char ch) { if (ch < m_minUnsafe_) { return false; @@ -1296,7 +1405,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * otherwise it is not deterministic. 
* @param ch character to be determined */ - protected final boolean isContractionEnd(char ch) + final boolean isContractionEnd(char ch) { if (UTF16.isTrailSurrogate(ch)) { return true; @@ -1315,93 +1424,11 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Resets the internal case data members and compression values. - */ - protected void updateInternalState() - { - if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { - m_caseSwitch_ = (byte)CASE_SWITCH_; - } - else { - m_caseSwitch_ = NO_CASE_SWITCH_; - } - - if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) { - m_mask3_ = CE_REMOVE_CASE_; - m_common3_ = COMMON_NORMAL_3_; - m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; - m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; - m_bottom3_ = COMMON_BOTTOM_3_; - } - else { - m_mask3_ = (byte)CE_KEEP_CASE_; - m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; - if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { - m_common3_ = COMMON_UPPER_FIRST_3_; - m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_; - m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_; - } else { - m_common3_ = COMMON_NORMAL_3_; - m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_; - m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_; - } - } - - // Set the compression values - int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1; - // we multilply double with int, but need only int - m_topCount3_ = (int)(PROPORTION_3_ * total3); - m_bottomCount3_ = total3 - m_topCount3_; - - if (!m_isCaseLevel_ && m_strength_ == AttributeValue.TERTIARY_ - && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) { - m_isSimple3_ = true; - } - else { - m_isSimple3_ = false; - } - } - - /** - *Converts the C attribute index and values for use and stores it into - * the relevant default attribute variable.
- *Note internal use, no sanity checks done on arguments
- */ - protected void setAttributeDefault(int attribute, int value) - { - switch (attribute) { - case Attribute.FRENCH_COLLATION_: - m_defaultIsFrenchCollation_ = (value == AttributeValue.ON_); - break; - case Attribute.ALTERNATE_HANDLING_: - m_defaultIsAlternateHandlingShifted_ = - (value == AttributeValue.SHIFTED_); - break; - case Attribute.CASE_FIRST_: - m_defaultCaseFirst_ = value; - break; - case Attribute.CASE_LEVEL_: - m_defaultIsCaseLevel_ = (value == AttributeValue.ON_); - break; - case Attribute.NORMALIZATION_MODE_: - if (value == AttributeValue.ON_) { - value = Collator.CANONICAL_DECOMPOSITION; - } - m_defaultDecomposition_ = value; - break; - case Attribute.STRENGTH_: - m_defaultStrength_ = value; - case Attribute.HIRAGANA_QUATERNARY_MODE_: - m_defaultIsHiragana4_ = (value == AttributeValue.ON_); - } - } - - /** * Retrieve the tag of a special ce * @param ce ce to test * @return tag of ce */ - protected static int getTag(int ce) + static int getTag(int ce) { return (ce & CE_TAG_MASK_) >> CE_TAG_SHIFT_; } @@ -1411,60 +1438,62 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * @param ce to check * @return true if ce is special */ - protected static boolean isSpecial(int ce) + static boolean isSpecial(int ce) { return (ce & CE_SPECIAL_FLAG_) == CE_SPECIAL_FLAG_; } - - /** - * Getting the mask for collation strength - * @param strength collation strength - * @return collation element mask - */ - protected static final int getMask(int strength) - { - switch (strength) - { - case Collator.PRIMARY: - return CE_PRIMARY_MASK_; - case Collator.SECONDARY: - return CE_SECONDARY_MASK_ | CE_PRIMARY_MASK_; - default: - return CE_TERTIARY_MASK_ | CE_SECONDARY_MASK_ - | CE_PRIMARY_MASK_; - } - } - /** - * Gets the primary weights from a CE - * @param ce collation element - * @return the primary weight of the collation element - */ - protected static final int getPrimaryWeight(int ce) - { - return ((ce) & CE_PRIMARY_MASK_) >> 
CE_PRIMARY_SHIFT_; - } - - /** - * Gets the secondary weights from a CE - * @param ce collation element - * @return the secondary weight of the collation element - */ - protected static final int getSecondaryWeight(int ce) - { - return (ce & CE_SECONDARY_MASK_) >> CE_SECONDARY_SHIFT_; - } - - /** - * Gets the tertiary weights from a CE - * @param ce collation element - * @return the tertiary weight of the collation element - */ - protected static final int getTertiaryWeight(int ce) - { - return ce & CE_TERTIARY_MASK_; - } - + /** + * Checks if the argument ce is a continuation + * @param ce collation element to test + * @return true if ce is a continuation + */ + static final boolean isContinuation(int ce) + { + return ce != CollationElementIterator.NULLORDER + && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; + } + + // protected constructor ------------------------------------------------- + + /** + * Constructors a RuleBasedCollator from the argument locale. + * If no resource bundle is associated with the locale, UCA is used + * instead. 
+ * @param locale + * @exception Exception thrown when there's an error creating the Collator + */ + RuleBasedCollator(Locale locale) throws Exception + { + ResourceBundle rb = ICULocaleData.getLocaleElements(locale); + + if (rb != null) { + byte map[] = (byte [])rb.getObject("%%CollationBin"); + BufferedInputStream input = + new BufferedInputStream(new ByteArrayInputStream(map)); + CollatorReader reader = new CollatorReader(input, false); + if (map.length > MIN_BINARY_DATA_SIZE_) { + reader.read(this); + } + else { + reader.readHeader(this); + reader.readOptions(this); + // duplicating UCA_'s data + setWithUCATables(); + } + Object rules = rb.getObject("CollationElements"); + if (rules != null) { + m_rules_ = (String)((Object[][])rules)[0][1]; + } + init(); + } + else { + setWithUCAData(); + } + } + + // private inner classes ------------------------------------------------ + // private variables ----------------------------------------------------- /** @@ -1528,7 +1557,6 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate /** * Case strength mask */ - private static final int CE_CASE_BIT_MASK_ = 0xC0; private static final int CE_CASE_MASK_3_ = 0xFF; /** * Sortkey size factor. Values can be changed. 
@@ -1547,14 +1575,10 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate private static final byte BYTE_SORTKEY_GLUE_ = (byte)0x02; private static final byte BYTE_SHIFT_PREFIX_ = (byte)0x03; private static final byte BYTE_UNSHIFTED_MIN_ = BYTE_SHIFT_PREFIX_; - private static final byte BYTE_FIRST_TAILORED_ = (byte)0x04; - private static final byte BYTE_COMMON_ = (byte)0x05; private static final byte BYTE_FIRST_UCA_ = BYTE_COMMON_; private static final byte BYTE_LAST_LATIN_PRIMARY_ = (byte)0x4C; private static final byte BYTE_FIRST_NON_LATIN_PRIMARY_ = (byte)0x4D; private static final byte BYTE_UNSHIFTED_MAX_ = (byte)0xFF; - private static final int COMMON_BOTTOM_2_ = BYTE_COMMON_; - private static final int COMMON_TOP_2_ = 0x86; // int for unsigness private static final int TOTAL_2_ = COMMON_TOP_2_ - COMMON_BOTTOM_2_ - 1; private static final int FLAG_BIT_MASK_CASE_SWITCH_OFF_ = 0x80; private static final int FLAG_BIT_MASK_CASE_SWITCH_ON_ = 0x40; @@ -1572,6 +1596,12 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate private static final int COMMON_NORMAL_3_ = COMMON_BOTTOM_3_; private static final int COMMON_4_ = (byte)0xFF; + /** + * Minimum size required for the binary collation data in bytes. + * Size of UCA header + size of options to 4 bytes + */ + private static final int MIN_BINARY_DATA_SIZE_ = (41 + 8) << 2; + /** * If this collator is to generate only simple tertiaries for fast path */ @@ -1582,7 +1612,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate */ private boolean m_isFrenchCollation_; /** - * Flag indicating if shifted is requested for quartenary alternate + * Flag indicating if shifted is requested for Quaternary alternate * handling. If this is not true, the default for alternate handling will * be non-ignorable. 
*/ @@ -1591,9 +1621,6 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * Extra case level for sorting */ private boolean m_isCaseLevel_; - - private static final int CE_TAG_SHIFT_ = 24; - private static final int CE_TAG_MASK_ = 0x0F000000; private static final int SORT_BUFFER_INIT_SIZE_ = 128; private static final int SORT_BUFFER_INIT_SIZE_1_ = @@ -1621,19 +1648,8 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * CE buffer size */ private static final int CE_BUFFER_SIZE_ = 512; - - // private methods ------------------------------------------------------- - /** - * Checks if the argument ce is a continuation - * @param ce collation element to test - * @return true if ce is a continuation - */ - private static final boolean isContinuation(int ce) - { - return ce != CollationElementIterator.NULLORDER - && (ce & CE_CONTINUATION_TAG_) == CE_CONTINUATION_TAG_; - } + // private methods ------------------------------------------------------- /** * Gets the 2 bytes of primary order and adds it to the primary byte array @@ -1645,8 +1661,8 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * a continuation ce * @param doShift flag indicating if ce is to be shifted * @param leadPrimary lead primary used for compression - * @param commonBottom4 common byte value for quartenary - * @param bottomCount4 smallest byte value for quartenary + * @param commonBottom4 common byte value for Quaternary + * @param bottomCount4 smallest byte value for Quaternary * @return the new lead primary for compression */ private final int doPrimaryBytes(int ce, byte bytes[][], int bytescount[], @@ -1656,7 +1672,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate { int p2 = (ce >>= 16) & LAST_BYTE_MASK_; // in ints for unsigned - int p1 = (ce >> 8) & LAST_BYTE_MASK_; // comparison + int p1 = ce >>> 8; // comparison if (doShift) { if (count[4] > 0) { while (count[4] > 
bottomCount4) { @@ -1931,17 +1947,17 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } /** - * Gets the quartenary byte and adds it to the quartenary byte array + * Gets the Quaternary byte and adds it to the Quaternary byte array * @param bytes array of byte arrays for each strength * @param bytescount array of the size of each strength byte arrays * @param count array of counters for each of the strength * @param isCodePointHiragana flag indicator if the previous codepoint * we dealt with was Hiragana - * @param commonBottom4 smallest common quartenary byte - * @param bottomCount4 smallest quartenary byte - * @param hiragana4 hiragana quartenary byte + * @param commonBottom4 smallest common Quaternary byte + * @param bottomCount4 smallest Quaternary byte + * @param hiragana4 hiragana Quaternary byte */ - private final void doQuartenaryBytes(byte bytes[][], int bytescount[], + private final void doQuaternaryBytes(byte bytes[][], int bytescount[], int count[], boolean isCodePointHiragana, int commonBottom4, int bottomCount4, @@ -1985,9 +2001,10 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate int count[], boolean doFrench, byte hiragana4, int commonBottom4, int bottomCount4) + { - int backupDecomposition = m_decomposition_; - m_decomposition_ = NO_DECOMPOSITION; // have to revert to backup later + int backupDecomposition = getDecomposition(); + setDecomposition(NO_DECOMPOSITION); // have to revert to backup later CollationElementIterator coleiter = new CollationElementIterator(source, this); @@ -2053,12 +2070,12 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } if (compare[4] && notIsContinuation) { // compare quad - doQuartenaryBytes(bytes, bytescount, count, + doQuaternaryBytes(bytes, bytescount, count, coleiter.m_isCodePointHiragana_, commonBottom4, bottomCount4, hiragana4); } } - m_decomposition_ = backupDecomposition; // reverts to original + 
setDecomposition(backupDecomposition); // reverts to original if (frenchOffset[0] != -1) { // one last round of checks reverseBuffer(bytes[2], frenchOffset); @@ -2117,14 +2134,16 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate private final void doFrench(byte bytes[][], int bytescount[], int count[]) { for (int i = 0; i < bytescount[2]; i ++) { - byte s = bytes[2][bytescount[2] - i - 1]; + byte s = bytes[2][bytescount[2] - i - 1]; // This is compression code. if (s == COMMON_2_) { ++ count[2]; } else { if (count[2] > 0) { - if (s > COMMON_2_) { // not necessary for 4th level. + // getting the unsigned value + if ((s & LAST_BYTE_MASK_) > COMMON_2_) { + // not necessary for 4th level. while (count[2] > TOP_COUNT_2_) { append(bytes, bytescount, 1, (byte)(COMMON_TOP_2_ - TOP_COUNT_2_)); @@ -2336,21 +2355,38 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate private final int getFirstUnmatchedOffset(String source, String target) { int result = 0; - int minlength = source.length(); - if (minlength > target.length()) { - minlength = target.length(); + int slength = source.length(); + int tlength = target.length(); + int minlength = slength; + if (minlength > tlength) { + minlength = tlength; } while (result < minlength && source.charAt(result) == target.charAt(result)) { result ++; } - if (result > 0 && result < minlength) { + if (result > 0) { // There is an identical portion at the beginning of the two // strings. If the identical portion ends within a contraction or a // combining character sequence, back up to the start of that - // sequence. - char schar = source.charAt(result); // first differing chars - char tchar = target.charAt(result); + // sequence. 
+ char schar = 0; + char tchar = 0; + if (result < minlength) { + schar = source.charAt(result); // first differing chars + tchar = target.charAt(result); + } + else { + if (slength == tlength) { + return result; + } + else if (slength < tlength) { + tchar = target.charAt(result); + } + else { + schar = source.charAt(result); + } + } if (isUnsafe(schar) || isUnsafe(tchar)) { // We are stopped in the middle of a contraction or combining @@ -2394,9 +2430,10 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * to compare. It is used when compare gets in trouble and needs to bail * out. * @param source text string - * @param target text string + * @param target text string */ private final int compareBySortKeys(String source, String target) + { CollationKey sourcekey = getCollationKey(source); CollationKey targetkey = getCollationKey(target); @@ -2432,6 +2469,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate String source, String target, int textoffset, int cebuffer[][], int cebuffersize[]) + { // Preparing the context objects for iterating over strings StringCharacterIterator siter = new StringCharacterIterator(source, @@ -2574,6 +2612,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate CollationElementIterator coleiter, int lowestpvalue, int cebuffer[][], int cebuffersize[], int cebufferindex) + { boolean shifted = false; int result = CollationElementIterator.IGNORABLE; @@ -2966,21 +3005,18 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate */ private static final int doIdenticalCompare(String source, String target, int offset, boolean normalize) + { if (normalize) { - /* - if (unorm_quickCheck(sColl->string, sLen, UNORM_NFD) != UNORM_YES) { - source = unorm_decompose(sColl->writableBuffer, - sColl->writableBufSize, - sBuf, sLen, FALSE, FALSE); + if (Normalizer.quickCheck(source, Normalizer.NFD) + != Normalizer.YES) { + source = 
Normalizer.decompose(source, false); } - if (unorm_quickCheck(tColl->string, tLen, UNORM_NFD) != UNORM_YES) { - target = unorm_decompose(tColl->writableBuffer, - tColl->writableBufSize, - tBuf, tLen, FALSE, FALSE); + if (Normalizer.quickCheck(target, Normalizer.NFD) + != Normalizer.YES) { + target = Normalizer.decompose(target, false); } - */ offset = 0; } @@ -3003,15 +3039,25 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate // compare identical prefixes - they do not need to be fixed up char schar = 0; char tchar = 0; - while (true) { + int slength = source.length(); + int tlength = target.length(); + int minlength = Math.min(slength, tlength); + while (offset < minlength) { schar = source.charAt(offset); tchar = target.charAt(offset ++); if (schar != tchar) { break; } - if (schar == 0) { - return 0; - } + } + + if (schar == tchar && offset == minlength) { + if (slength > minlength) { + return 1; + } + if (tlength > minlength) { + return -1; + } + return 0; } // if both values are in or above the surrogate range, Fix them up. @@ -3046,6 +3092,7 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate * @return true if source after offset is ignorable. false otherwise */ private final boolean checkIgnorable(String source, int offset) + { StringCharacterIterator siter = new StringCharacterIterator(source, offset, source.length(), offset); @@ -3060,4 +3107,83 @@ public class RuleBasedCollator extends Collator implements Trie.DataManipulate } return true; } + + /** + * Resets the internal case data members and compression values. 
+ */ + private void updateInternalState() + { + if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { + m_caseSwitch_ = (byte)CASE_SWITCH_; + } + else { + m_caseSwitch_ = NO_CASE_SWITCH_; + } + + if (m_isCaseLevel_ || m_caseFirst_ == AttributeValue.OFF_) { + m_mask3_ = CE_REMOVE_CASE_; + m_common3_ = COMMON_NORMAL_3_; + m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_OFF_; + m_top3_ = COMMON_TOP_CASE_SWITCH_OFF_3_; + m_bottom3_ = COMMON_BOTTOM_3_; + } + else { + m_mask3_ = (byte)CE_KEEP_CASE_; + m_addition3_ = FLAG_BIT_MASK_CASE_SWITCH_ON_; + if (m_caseFirst_ == AttributeValue.UPPER_FIRST_) { + m_common3_ = COMMON_UPPER_FIRST_3_; + m_top3_ = COMMON_TOP_CASE_SWITCH_UPPER_3_; + m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_UPPER_3_; + } else { + m_common3_ = COMMON_NORMAL_3_; + m_top3_ = COMMON_TOP_CASE_SWITCH_LOWER_3_; + m_bottom3_ = COMMON_BOTTOM_CASE_SWITCH_LOWER_3_; + } + } + + // Set the compression values + int total3 = m_top3_ - COMMON_BOTTOM_3_ - 1; + // we multilply double with int, but need only int + m_topCount3_ = (int)(PROPORTION_3_ * total3); + m_bottomCount3_ = total3 - m_topCount3_; + + if (!m_isCaseLevel_ && getStrength() == AttributeValue.TERTIARY_ + && !m_isFrenchCollation_ && !m_isAlternateHandlingShifted_) { + m_isSimple3_ = true; + } + else { + m_isSimple3_ = false; + } + } + + /** + * Initializes the RuleBasedCollator + */ + private final void init() + { + for (m_minUnsafe_ = 0; m_minUnsafe_ < DEFAULT_MIN_HEURISTIC_; + m_minUnsafe_ ++) { + // Find the smallest unsafe char. + if (isUnsafe(m_minUnsafe_)) { + break; + } + } + + for (m_minContractionEnd_ = 0; + m_minContractionEnd_ < DEFAULT_MIN_HEURISTIC_; + m_minContractionEnd_ ++) { + // Find the smallest contraction-ending char. 
+ if (isContractionEnd(m_minContractionEnd_)) { + break; + } + } + setStrength(m_defaultStrength_); + setDecomposition(m_defaultDecomposition_); + m_isFrenchCollation_ = m_defaultIsFrenchCollation_; + m_isAlternateHandlingShifted_ = m_defaultIsAlternateHandlingShifted_; + m_isCaseLevel_ = m_defaultIsCaseLevel_; + m_caseFirst_ = m_defaultCaseFirst_; + m_isHiragana4_ = m_defaultIsHiragana4_; + updateInternalState(); + } } diff --git a/icu4j/src/com/ibm/icu/text/SearchIterator.java b/icu4j/src/com/ibm/icu/text/SearchIterator.java index befa1290554..2f8dce21da3 100755 --- a/icu4j/src/com/ibm/icu/text/SearchIterator.java +++ b/icu4j/src/com/ibm/icu/text/SearchIterator.java @@ -5,423 +5,715 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SearchIterator.java,v $ - * $Date: 2002/04/03 19:13:56 $ - * $Revision: 1.6 $ + * $Date: 2002/06/21 23:56:48 $ + * $Revision: 1.7 $ * ***************************************************************************************** */ package com.ibm.icu.text; -import java.text.BreakIterator; import java.text.CharacterIterator; /** - *SearchIterator
is an abstract base class that provides methods - * to search for a pattern within a text string. Instances of - *SearchIterator
maintain a current position and scan over - * the target text, returning the indices the pattern is matched - * and the length of each match. *- *
*SearchIterator
is an abstract base class that defines a - * protocol for text searching. Subclasses provide concrete implementations of - * various search algorithms. For example, {@link StringSearch} - * implements language-sensitive pattern matching based on the comparison rules - * defined in a {@link java.text.RuleBasedCollator RuleBasedCollator} object. + * SearchIterator is an abstract base class that defines a protocol for text + * searching. Subclasses provide concrete implementations of various search + * algorithms. The concrete subclass, StringSearch, is provided and implements + * language-sensitive pattern matching based on the comparison rules defined in + * a RuleBasedCollator object. Instances of SearchIterator maintain a current + * position and scan over the target text, returning the indices where a + * matched is found and the length of each match. Generally, the sequence of + * forward matches will be equivalent to the sequence of backward matches. + *- * Internally,
*SearchIterator
scans text using a - * {@link CharacterIterator}, and is thus able to scan text held - * by any object implementing that protocol. AStringCharacterIterator
- * is used to scanString
objects passed tosetText
. + * Internally, SearchIterator scans text using a CharacterIterator, and is thus + * able to scan text held by any object implementing that protocol. + *- *
SearchIterator
provides an API that is similar to that of - * other text iteration classes such asBreakIterator
. Using this - * class, it is easy to scan through text looking for all occurances of a - * given pattern. The following example uses aStringSearch
object to - * find all instances of "fox" in the target string. Any other subclass of - *SearchIterator
can be used in an identical manner. - *- * - * @see StringSearch + * + * + * @author Laura Werner, synwee + * @since release 1.0 + * @draft release 2.2 + * @see BreakIterator */ -public abstract class SearchIterator { +public abstract class SearchIterator +{ + + // public data members ------------------------------------------------- + /** - * DONE is returned by previous() and next() after all valid - * matches have been returned, and by first() and last() if - * there are no matches at all. + * DONE is returned by previous() and next() after all valid matches have + * been returned, and by first() and last() if there are no matches at all. + * @see #previous + * @see #next */ public static final int DONE = -1; - /** - * Private value indicating that the iterator is pointing - * before the beginning of the target text. - */ - private static final int BEFORE = -2; - - /** - * Return the first index at which the target text matches the search - * pattern. The iterator is adjusted so that its current index - * (as returned by {@link #getIndex}) is the match posisition if one was found - * and+ * If logical matches are required, BreakIterators can be used to define the + * boundaries of a logical match. For instance the pattern "e" will + * not be found in the string "\u00e9" if a CharacterBreakIterator is used. + * By default, the SearchIterator does not impose any logic matches, it will + * return any result that matches the pattern. Illustrating with the above + * example, "e" will be found in the string "\u00e9" if no BreakIterator is + * specified. + * + *
+ * SearchIterator also provides means to handle overlapping matches via the + * API setOverlapping(boolean). For example, if the overlapping mode is set, + * searching for the pattern "abab" in the text "ababab" will yield the results + * 0 and 2, where else if overlapping is not set, SearchIterator will only + * produce the result of 0. By default the overlapping mode is not set. + *
+ *+ * The APIs in SearchIterator is similar to that of other text iteration + * classes such as the BreakIterator. Using this class, it is easy to + * scan through text looking for all occurances of a match. The + * following example uses a StringSearch object to find all instances of + * "fox" in the target string. Any other subclass of SearchIterator can be + * used in an identical manner. + *
+ *+ * Example of use:
+ ** String target = "The quick brown fox jumped over the lazy fox"; * String pattern = "fox"; - * * SearchIterator iter = new StringSearch(pattern, target); - * - * for (int pos = iter.first(); pos != SearchIterator.DONE; pos = iter.next()) { - * System.out.println("Found match at " + pos + - * ", length is " + iter.getMatchLength()); + * for (int pos = iter.first(); pos != SearchIterator.DONE; + * pos = iter.next()) { + * System.out.println("Found match at " + pos + ", length is " + * + iter.getMatchLength()); * } - *DONE
if one was not. - * - * @return The character index of the first match, orDONE
if there - * are no matches. - */ - final public int first() { - setIndex(BEFORE); - return next(); - } - - /** - * Return the first index greater than pos at which the target - * text matches the search pattern. The iterator is adjusted so that its current index - * (as returned by {@link #getIndex}) is the match posisition if one was found - * andDONE
if one was not. - * - * @return The character index of the first match followingpos
, - * or DONE if there are no matches. - */ - final public int following(int pos) { - setIndex(pos); - return next(); - } + // public methods ----------------------------------------------------- + + // public setters ----------------------------------------------------- /** - * Return the last index in the target text at which it matches - * the search pattern and adjusts the iteration to point to that position. - * - * @return The index of the first match, or DONE if there - * are no matches. - */ - final public int last() { - setIndex(DONE); - return previous(); - } - - /** - * Return the first index less thanpos
at which the target - * text matches the search pattern. The iterator is adjusted so that its current index - * (as returned by {@link #getIndex}) is the match posisition if one was found - * and DONE if one was not. - * - * @return The character index of the first match precedingpos
, - * orDONE
if there are no matches. - */ - final public int preceding(int pos) { - setIndex(pos); - return previous(); - } - - /** - * Return the index of the next point at which the text matches the - * search pattern, starting from the current position. - * @return The index of the next match after the current position, - * orDONE
if there are no more matches. - * - * @see #first - */ - public int next() { - if (index == BEFORE){ - // Starting at the beginning of the text - index = target.getBeginIndex(); - } else if (length > 0) { - // Finding the next match after a previous one - index += overlap ? 1 : length; - } - index -= 1; - - do { - length = 0; - index = handleNext(index + 1); - } while (index != DONE && !isBreakUnit(index, index+length)); - - return index; - } - - /** - * Return the index of the previous point at which the text matches - * the search pattern, starting at the current position - * - * @return The index of the previous match before the current position, - * orDONE
if there are no more matches. - */ - public int previous() { - if (index == DONE) { - index = target.getEndIndex(); - } else if (length > 0) { - // Finding the previous match before a following one - index = overlap ? index + length - 1 : index; - } - index += 1; - - do { - length = 0; - index = handlePrev(index - 1); - } while (index != DONE && !isBreakUnit(index, index+length)); - - if (index == DONE) { - index = BEFORE; - } - return getIndex(); - } - - - - /** - * Return the current index in the text being searched. - * If the iteration has gone past the end of the text - * (or past the beginning for a backwards search), - * {@link #DONE} is returned. - */ - public int getIndex() { - return index == BEFORE ? DONE : index; - } - - /** - * Determines whether overlapping matches are returned. If this - * property istrue
, matches that begin within the - * boundry of the previous match are considered valid and will - * be returned. For example, when searching for "abab" in the - * target text "ababab", both offsets 0 and 2 will be returned - * as valid matches if this property istrue
. *- * The default setting of this property is true + * Sets the position in the target text which the next search will start + * from to the argument. This method clears all previous states. + *
+ * @param position index to start next search from. + * @exception IndexOutOfBoundsException thrown if argument position is out + * of the target text range. + * @see #getIndex + * @draft release 2.2 */ - public void setOverlapping(boolean allowOverlap) { - overlap = allowOverlap; - } - - /** - * Determines whether overlapping matches are returned. - * - * @see #setOverlapping - */ - public boolean isOverlapping() { - return overlap; - } - - /** - * Returns the length of text in the target which matches the search - * pattern. This call returns a valid result only after a successful - * call to {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. - * Just after construction, or after a searching method returns - * DONE, this method will return 0. - * - * @return The length of the match in the target text, or 0 if there - * is no match currently. - */ - public int getMatchLength() { - return length; - } - - /** - * Set the BreakIterator that will be used to restrict the points - * at which matches are detected. - * - * @param breaker A {@link java.text.BreakIterator BreakIterator} - * that will be used to restrict the points - * at which matches are detected. If a match is found, but the match's start - * or end index is not a boundary as determined by - * the BreakIterator, the match will be rejected and - * another will be searched for. - * - * If this parameter is null, no break - * detection is attempted. 
- * - * @see #getBreakIterator - */ - public void setBreakIterator(BreakIterator iterator) { - breaker = iterator; - if (breaker != null) { - breaker.setText(target); + public void setIndex(int position) { + if (position < targetText.getBeginIndex() + || position > targetText.getEndIndex()) { + throw new IndexOutOfBoundsException( + "setIndex(int) expected position to be between " + + targetText.getBeginIndex() + " and " + targetText.getEndIndex()); } + m_setOffset_ = position; + m_reset_ = false; + matchLength = 0; } - - /** - * Returns the BreakIterator that is used to restrict the points - * at which matches are detected. This will be the same object - * that was passed to the constructor or tosetBreakIterator
. - * Note that null is a legal value; it means that break + + /** + *+ * Determines whether overlapping matches are returned. See the class + * documentation for more information about overlapping matches. + *
+ *+ * The default setting of this property is false + *
+ * @param allowOverlap flag indicator if overlapping matches are allowed + * @see #isOverlapping + * @draft release 2.2 + */ + public void setOverlapping(boolean allowOverlap) + { + m_isOverlap_ = allowOverlap; + } + + /** + * Set the BreakIterator that is used to restrict the points at which + * matches are detected. + * Using null as the parameter is legal; it means that break * detection should not be attempted. - * - * @see #setBreakIterator + * See class documentation for more information. + * @param breakiter A BreakIterator that will be used to restrict the + * points at which matches are detected. + * @see #getBreakIterator + * @see BreakIterator */ - public BreakIterator getBreakIterator() { - return breaker; - } - - /** - * Set the target text which should be searched and resets the - * iterator's position to point before the start of the target text. - * This method is useful if you want to re-use an iterator to - * search for the same pattern within a different body of text. - * - * @see #getTarget - */ - public void setTarget(CharacterIterator iterator) { - target = iterator; - if (breaker != null) { - breaker.setText(target); + public void setBreakIterator(BreakIterator breakiter) + { + breakIterator = breakiter; + if (breakIterator != null) { + breakIterator.setText(targetText); } - setIndex(BEFORE); } /** - * Return the target text which is being searched - * + * Set the target text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search within a different body of text. 
+ * @param text new text iterator to look for match, + * @exception IllegalArgumentException thrown when text is null or has + * 0 length + * @see #getTarget + * @draft ICU 2.0 + */ + public void setTarget(CharacterIterator text) + { + if (text == null || text.getEndIndex() == text.getIndex()) { + throw new IllegalArgumentException("Illegal null or empty text"); + } + + targetText = text; + targetText.setIndex(targetText.getBeginIndex()); + matchLength = 0; + m_reset_ = true; + if (breakIterator != null) { + breakIterator.setText(targetText); + } + } + + // public getters ---------------------------------------------------- + + /** + *+ * Returns the index to the most recent match in the target text that was + * searched. + * This call returns a valid result only after a successful call to + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * Just after construction, or after a searching method returns + * DONE, this method will return DONE. + *
+ *+ * Use getMatchLength to get the matched text length. + * getMatchedText will return the subtext in the searched + * target text from index getMatchStart() with length getMatchLength(). + *
+ * @return index to a substring within the text string that is being + * searched. + * @see #getMatchLength + * @see #getMatchedText + * @see #first + * @see #next + * @see #previous + * @see #last + * @see #DONE + * @draft release 2.2 + */ + public int getMatchStart() + { + return targetText.getIndex(); + } + + /** + * Return the index in the target text where the iterator is currently + * positioned at. + * If the iteration has gone past the end of the target text or past + * the beginning for a backwards search, {@link #DONE} is returned. + * @return index in the target text where the iterator is currently + * positioned at. + * @draft release 2.2 + * @see #first + * @see #next + * @see #previous + * @see #last + * @see #DONE + */ + public abstract int getIndex(); + + /** + *+ * Returns the subtext length of the most recent match in the target text. + * This call returns a valid result only after a successful + * call to {@link #first}, {@link #next}, {@link #previous}, or + * {@link #last}. + * Just after construction, or after a searching method returns + * DONE, this method will return 0. See getMatchStart() for + * more details. + *
+ * @return The length of the most recent match in the target text, or 0 if + * there is no match. + * @see #getMatchStart + * @see #getMatchedText + * @see #first + * @see #next + * @see #previous + * @see #last + * @see #DONE + */ + public int getMatchLength() + { + return matchLength; + } + + /** + * Returns the BreakIterator that is used to restrict the indexes at which + * matches are detected. This will be the same object that was passed to + * the constructor or tosetBreakIterator
. + * If the BreakIterator has not been set, null will be returned. + * See setBreakIterator for more information. + * @return the BreakIterator set to restrict logic matches + * @see #setBreakIterator + * @see BreakIterator + */ + public BreakIterator getBreakIterator() + { + return breakIterator; + } + + /** + * Return the target text which is being searched. + * @return target text being searched. * @see #setTarget */ - public CharacterIterator getTarget() { - return target; + public CharacterIterator getTarget() + { + return targetText; } /** * Returns the text that was matched by the most recent call to - * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. - * If the iterator is not pointing at a valid match (e.g. just after - * construction or after DONE has been returned, returns - * an empty string. + * {@link #first}, {@link #next}, {@link #previous}, or {@link #last}. + * If the iterator is not pointing at a valid match, for instance just + * after construction or after DONE has been returned, an empty + * String will be returned. 
See getMatchStart for more information + * @see #getMatchStart + * @see #getMatchLength + * @see #first + * @see #next + * @see #previous + * @see #last + * @see #DONE + * @return the subtext in target text of the most recent match */ - public String getMatchedText() { - StringBuffer buffer = new StringBuffer(); - - if (length > 0) { - int i = 0; - for (char c = target.setIndex(index); i < length; c = target.next(), i++) - { - buffer.append(c); - } - } - return buffer.toString(); + public String getMatchedText() + { + if (matchLength > 0) { + int start = targetText.getIndex(); + int limit = start + matchLength; + StringBuffer result = new StringBuffer(matchLength); + result.append(targetText.current()); + targetText.next(); + while (targetText.getIndex() < limit) { + result.append(targetText.current()); + targetText.next(); + } + targetText.setIndex(start); + return result.toString(); + } + return null; } - //------------------------------------------------------------------- - // Protected interface for subclasses - //------------------------------------------------------------------- + // miscellaneous public methods ----------------------------------------- + + /** + * Returns the index of the next forwards valid match in the target + * text, + * starting the search from the current iterator position. The iterator is + * adjusted so that its current index, as returned by {@link #getIndex}, + * is the starting position of the match if one was found. If a match is + * not found, DONE will be returned. + * @return The starting index of the next forward match after the current + * iterator position, or + * DONE if there are no more matches. 
+ * @see #getMatchStart + * @see #getMatchLength + * @see #getMatchedText + * @see #following + * @see #preceding + * @see #previous + * @see #first + * @see #last + * @see #DONE + */ + public int next() + { + int start = targetText.getIndex(); + if (m_setOffset_ != DONE) { + start = m_setOffset_; + m_setOffset_ = DONE; + } + if (m_isForwardSearching_) { + if (!m_reset_ && + start + matchLength >= targetText.getEndIndex()) { + // not enough characters to match + matchLength = 0; + targetText.setIndex(targetText.getEndIndex()); + return DONE; + } + m_reset_ = false; + } + else { + // switching direction. + // if matchedIndex == USEARCH_DONE, it means that either a + // setIndex has been called or that previous ran off the text + // string. the iterator would have been set to offset 0 if a + // match is not found. + m_isForwardSearching_ = true; + if (start != DONE) { + // there's no need to set the collation element iterator + // the next call to next will set the offset. + return start; + } + } + + if (start == DONE) { + start = targetText.getBeginIndex(); + } + return handleNext(start); + } /** - * Constructor for use by subclasses. - *- * @param target The target text to be searched. This is for internal - * use by this class. Subclasses need to maintain their - * own reference to or iterator over the target text - * for use by their {@link #handleNext handleNext} and - * {@link #handlePrev handlePrev} methods. - * - * @param breaker A {@link BreakIterator} that is used to restrict the points - * at which matches are detected. If handleNext or - * handlePrev finds a match, but the match's start - * or end index is not a boundary as determined by - * the BreakIterator, the match is rejected and - * handleNext or handlePrev is called again. - * If this parameter is null, no break - * detection is attempted. - * + * Returns the index of the next backwards valid match in the target + * text, + * starting the search from the current iterator position. 
The iterator is + * adjusted so that its current index, as returned by {@link #getIndex}, + * is the starting position of the match if one was found. If a match is + * not found, DONE will be returned. + * @return The starting index of the next backwards match after the current + * iterator position, or + * DONE if there are no more matches. + * @see #getMatchStart + * @see #getMatchLength + * @see #getMatchedText + * @see #following + * @see #preceding + * @see #next + * @see #first + * @see #last + * @see #DONE + */ + public int previous() + { + int start = targetText.getIndex(); + if (m_setOffset_ != DONE) { + start = m_setOffset_; + m_setOffset_ = DONE; + } + if (m_reset_) { + m_isForwardSearching_ = false; + m_reset_ = false; + start = targetText.getEndIndex();; + } + + if (m_isForwardSearching_ == true) { + // switching direction. + // if matchedIndex == USEARCH_DONE, it means that either a + // setIndex has been called or that next ran off the text + // string. the iterator would have been set to offset textLength if + // a match is not found. + m_isForwardSearching_ = false; + if (start != DONE) { + return start; + } + start = targetText.getEndIndex(); + } + else { + if (start == DONE) { + return DONE; + } + if (start == targetText.getBeginIndex()) { + // not enough characters to match + matchLength = 0; + targetText.setIndex(targetText.getBeginIndex()); + return DONE; + } + } + + return handlePrevious(start); + } + + /** + * Checks if the overlapping property has been set. + * See setOverlapping(boolean) for more information. + * @see #setOverlapping + * @return true if the overlapping property has been set, false otherwise + * @draft release 2.2 + */ + public boolean isOverlapping() + { + return m_isOverlap_; + } + + /** + *
+ * Resets the search iteration. All properties will be reset to the + * default value. + *
+ *+ * Search will begin at the start of the target text if a forward iteration + * is initiated before a backwards iteration. Otherwise if a + * backwards iteration is initiated before a forwards iteration, the search + * will begin at the end of the target text. + *
+ * @draft release 2.2 + */ + public void reset() + { + // reset is setting the attributes that are already in string search + matchLength = 0; + setIndex(targetText.getBeginIndex()); + m_isOverlap_ = false; + m_isForwardSearching_ = true; + m_reset_ = true; + m_setOffset_ = DONE; + } + + /** + * Return the index of the first forward match in the target text. + * This method effectively sets the iteration to begin at the start of the + * target text and searches forwards from there. + * The iterator is + * adjusted so that its current index, as returned by {@link #getIndex}, + * is the starting position of the match if one was found. If a match is + * not found, DONE will be returned. + * @return The index of the first forward match, orDONE
+ * if there are no matches. + * @see #getMatchStart + * @see #getMatchLength + * @see #getMatchedText + * @see #following + * @see #preceding + * @see #next + * @see #previous + * @see #last + * @see #DONE + */ + public final int first() + { + m_isForwardSearching_ = true; + setIndex(targetText.getBeginIndex()); + return next(); + } + + /** + * Return the index of the first forward match in target text that + * is greater than argument position. + * This method effectively sets the iteration to begin at the argument + * position index of the target text and searches forwards from there. + * The iterator is + * adjusted so that its current index, as returned by {@link #getIndex}, + * is the starting position of the match if one was found. If a match is + * not found, DONE will be returned. + * @return The index of the first forward match, orDONE
+ * if there are no matches. + * @see #getMatchStart + * @see #getMatchLength + * @see #getMatchedText + * @see #first + * @see #preceding + * @see #next + * @see #previous + * @see #last + * @see #DONE + */ + public final int following(int position) + { + m_isForwardSearching_ = true; + // position checked in usearch_setOffset + setIndex(position); + return next(); + } + + /** + * Return the index of the last forward match in target text. + * This method effectively sets the iteration to begin at the end of the + * target text and searches backwards from there. + * The iterator is + * adjusted so that its current index, as returned by {@link #getIndex}, + * is the starting position of the match if one was found. If a match is + * not found, DONE will be returned. + * @return The starting index of the last forward match, or + *DONE
if there are no matches. + * @see #getMatchStart + * @see #getMatchLength + * @see #getMatchedText + * @see #first + * @see #preceding + * @see #next + * @see #previous + * @see #following + * @see #DONE + */ + public final int last() + { + m_isForwardSearching_ = false; + setIndex(targetText.getEndIndex()); + return previous(); + } + + /** + * Return the index of the first backwards match in target + * text that is less than argument position. + * This method effectively sets the iteration to begin at the argument + * position index of the target text and searches backwards from there. + * The iterator is + * adjusted so that its current index, as returned by {@link #getIndex}, + * is the starting position of the match if one was found. If a match is + * not found, DONE will be returned. + * @return The starting index of the first backwards match, or + *DONE
+ * if there are no matches. + * @see #getMatchStart + * @see #getMatchLength + * @see #getMatchedText + * @see #first + * @see #following + * @see #next + * @see #previous + * @see #last + * @see #DONE + */ + public final int preceding(int position) + { + m_isForwardSearching_ = false; + // position checked in usearch_setOffset + setIndex(position); + return previous(); + } + + // protected data member ---------------------------------------------- + + /** + * The BreakIterator to define the boundaries of a logical match. + * This value can be a null. + * See class documentation for more information. + * @see #setBreakIterator(BreakIterator) + * @see #getBreakIterator + * @see BreakIterator + */ + protected BreakIterator breakIterator; + /** + * Target text for searching. + * @see #setTarget(CharacterIterator) + * @see #getTarget + */ + protected CharacterIterator targetText; + /** + * Length of the most current match in target text. + * Value 0 is the default value. + * @see #setMatchLength + * @see #getMatchLength + */ + protected int matchLength; + + // protected constructor ---------------------------------------------- + + /** + * Protected constructor for use by subclasses. + * Initializes the iterator with the argument target text for searching + * and sets the BreakIterator. + * See class documentation for more details on the use of the target text + * and BreakIterator. + * @param target The target text to be searched. + * @param breaker A {@link BreakIterator} that is used to determine the + * boundaries of a logical match. This argument can be null. 
+ * @exception IllegalArgumentException thrown when argument target is null, + * or of length 0 + * @see BreakIterator */ protected SearchIterator(CharacterIterator target, BreakIterator breaker) { - this.target = target; - - if (breaker != null) { - this.breaker = (BreakIterator)breaker.clone(); - this.breaker.setText(target); + if (target == null + || (target.getEndIndex() - target.getBeginIndex()) == 0) { + throw new IllegalArgumentException( + "Illegal argument target. " + + " Argument can not be null or of length 0"); } - - index = target.getBeginIndex(); - length = 0; - } - - /** - * Abstract method which subclasses override to provide the mechanism - * for finding the next match in the target text. This allows different - * subclasses to provide different search algorithms. - *- * If a match is found, the implementation should return the index at - * which the match starts and should call {@link #setMatchLength setMatchLength} - * with the number of characters in the target - * text that make up the match. If no match is found, the method - * should return DONE and should not call setMatchLength. - *
- * @param startAt The index in the target text at which the search - * should start. - * - * @see #setMatchLength - */ - protected abstract int handleNext(int startAt); - - /** - * Abstract method which subclasses override to provide the mechanism - * for finding the previous match in the target text. This allows different - * subclasses to provide different search algorithms. - *
- * If a match is found, the implementation should return the index at - * which the match starts and should call {@link #setMatchLength setMatchLength} - * with the number of characters in the target - * text that make up the match. If no match is found, the method - * should return DONE and should not call setMatchLength. - *
- * @param startAt The index in the target text at which the search - * should start. - * - * @see #setMatchLength - */ - protected abstract int handlePrev(int startAt); - - /** - * Sets the length of the currently matched string in the target text. - * Subclasses'
handleNext
andhandlePrev
- * methods should call this when they find a match in the target text. - */ - protected void setMatchLength(int length) { - this.length = length; - } - - //------------------------------------------------------------------- - // Privates - // + targetText = target; + breakIterator = breaker; + if (breakIterator != null) { + breakIterator.setText(target); + } + matchLength = 0; + m_isOverlap_ = false; + m_isForwardSearching_ = true; + m_reset_ = true; + m_setOffset_ = DONE; + } + // protected methods -------------------------------------------------- + + /** - * Internal method used by preceding and following. Sets the index - * to point to the given position, and clears any state that's - * affected. - */ - private void setIndex(int pos) { - index = pos; - length = 0; - } - - /** - * Determine whether the target text bounded bystart
and - *end
is one or more whole units of text as determined by - * the currentBreakIterator
. - */ - private boolean isBreakUnit(int start, int end) + * Sets the length of the most recent match in the target text. + * Subclasses' handleNext() and handlePrevious() methods should call this + * after they find a match in the target text. + * @param length new length to set + * @see #handleNext + * @see #handlePrevious + */ + protected void setMatchLength(int length) { - if (breaker == null) { - return true; - } - boolean startBound = breaker.isBoundary(start); - boolean endBound = (end == target.getEndIndex()) || breaker.isBoundary(end); - - return startBound && endBound; + matchLength = length; } + + /** + *+ * Abstract method which subclasses override to provide the mechanism + * for finding the next forwards match in the target text. This + * allows different subclasses to provide different search algorithms. + *
+ *+ * If a match is found, setMatchLength(int) would have to be called to + * set the length of the result match. + * The iterator is adjusted so that its current index, as returned by + * {@link #getIndex}, is the starting position of the match if one was + * found. If a match is not found, DONE will be returned. + *
+ * @param start index in the target text at which the forwards search + * should begin. + * @return the starting index of the next forwards match if found, DONE + * otherwise + * @see #setMatchLength(int) + * @see #handlePrevious(int) + * @see #DONE + */ + protected abstract int handleNext(int start); - //------------------------------------------------------------------------- - // Private data... - //------------------------------------------------------------------------- - private int index; // Current position in the target text - private int length; // Length of matched text, or 0 - private boolean overlap = true; // Return overlapping matches? - private CharacterIterator target; // Target text to be searched - private BreakIterator breaker; // Break iterator to constrain matches -}; + /** + *+ * Abstract method which subclasses override to provide the mechanism + * for finding the next backwards match in the target text. + * This allows different + * subclasses to provide different search algorithms. + *
+ *+ * If a match is found, setMatchLength(int) would have to be called to + * set the length of the result match. + * The iterator is adjusted so that its current index, as returned by + * {@link #getIndex}, is the starting position of the match if one was + * found. If a match is not found, DONE will be returned. + *
+ * @param start index in the target text at which the backwards search + * should begin. + * @return the starting index of the next backwards match if found, + * DONE otherwise + * @see #setMatchLength(int) + * @see #handleNext(int) + * @see #DONE + */ + protected abstract int handlePrevious(int startAt); + + // private data members ------------------------------------------------ + + /** + * Flag indicates if we are doing a forwards search + */ + private boolean m_isForwardSearching_; + /** + * Flag to indicate if overlapping search is to be done. + * E.g. looking for "aa" in "aaa" will yield matches at offset 0 and 1. + */ + private boolean m_isOverlap_; + /** + * Flag indicates if we are at the start of a string search. + * This indicates that we are in forward search and at the start of m_text. + */ + private boolean m_reset_; + /** + * Data member to store user defined position in setIndex(). + * If setIndex() is not called, this value will be DONE. + */ + private int m_setOffset_; +} diff --git a/icu4j/src/com/ibm/icu/text/StringSearch.java b/icu4j/src/com/ibm/icu/text/StringSearch.java index 44dfa5a3747..e33989c1621 100755 --- a/icu4j/src/com/ibm/icu/text/StringSearch.java +++ b/icu4j/src/com/ibm/icu/text/StringSearch.java @@ -5,642 +5,3081 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/StringSearch.java,v $ - * $Date: 2002/03/20 05:11:16 $ - * $Revision: 1.6 $ + * $Date: 2002/06/21 23:56:48 $ + * $Revision: 1.7 $ * ***************************************************************************************** */ package com.ibm.icu.text; -import java.text.BreakIterator; import java.text.CharacterIterator; -import java.text.CollationElementIterator; -import java.text.Collator; -import java.text.RuleBasedCollator; import java.text.StringCharacterIterator; import java.util.Locale; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.impl.NormalizerImpl; /** - *StringSearch
is aSearchIterator
that provides - * language-sensitive text searching based on the comparison rules defined - * in a {@link RuleBasedCollator} object. - * Instances ofStringSearch
function as iterators - * maintain a current position and scan over text returning the index of - * characters where the pattern occurs and the length of each match. + *+ *
*StringSearch
is the concrete subclass of + *SearchIterator
that provides language-sensitive text searching + * based on the comparison rules defined in a {@link RuleBasedCollator} object. + **
*StringSearch
uses a version of the fast Boyer-Moore search * algorithm that has been adapted to work with the large character set of - * Unicode. See "Efficient Text Searching in Java", to be published in - * Java Report in February, 1999, for further information on the algorithm. + * Unicode. Refer to + * + * "Efficient Text Searching in Java", published in the + * Java Report on February, 1999, for further information on the + * algorithm. + *- * Consult the
+ *SearchIterator
documentation for information on - * and examples of how to use instances of this class to implement text - * searching.SearchIterator
provides all of the necessary
- * API; this class only provides constructors and internal implementation
- * methods.
- * 
+ * Users are also strongly encouraged to read the section on
+ * <a href="http://oss.software.ibm.com/icu/userguide/searchString.html">
+ * String Search</a> and
+ * <a href="http://oss.software.ibm.com/icu/userguide/Collate_Intro.html">
+ * Collation</a> in the user guide before attempting to use this class.
+ * </p>
+ * <p>
+ * String searching gets a little complicated when accents are encountered at
+ * match boundaries. If a match is found and it has preceding or trailing
+ * accents not part of the match, the result returned will include the
+ * preceding accents up to the first base character, if the pattern searched
+ * for starts with an accent. Likewise,
+ * if the pattern ends with an accent, all trailing accents up to the first
+ * base character will be included in the result.
+ * </p>
+ *+ * For example, if a match is found in target text "a\u0325\u0300" for + * the pattern + * "a\u0325", the result returned by StringSearch will be the index 0 and + * length 3 <0, 3>. If a match is found in the target + * "a\u0325\u0300" + * for the pattern "\u0300", then the result will be index 1 and length 2 + * <1, 2>. + *
+ * <p>
+ * In the case where the decomposition mode is on for the RuleBasedCollator,
+ * all matches that start or end with an accent will have their results
+ * include preceding or following accents respectively. For example, if
+ * pattern "a" is looked for in the target text "á\u0325", the result will be
+ * index 0 and length 2 <0, 2>.
+ * </p>
+ *+ * The StringSearch class provides two options to handle accent matching + * described below: + *
+ *+ * Let S' be the sub-string of a text string S between the offsets start and + * end <start, end>. + *
+ * A pattern string P matches a text string S at the offsets <start, + * length> + *
+ * if + *+ * option 1. P matches some canonical equivalent string of S'. Suppose the + * RuleBasedCollator used for searching has a collation strength of + * TERTIARY, all accents are non-ignorable. If the pattern + * "a\u0300" is searched in the target text + * "a\u0325\u0300", + * a match will be found, since the target text is canonically + * equivalent to "a\u0300\u0325" + * option 2. P matches S' and if P starts or ends with a combining mark, + * there exists no non-ignorable combining mark before or after S’ + * in S respectively. Following the example above, the pattern + * "a\u0300" will not find a match in "a\u0325\u0300", + * since + * there exists a non-ignorable accent '\u0325' in the middle of + * 'a' and '\u0300'. Even with a target text of + * "a\u0300\u0325" a match will not be found because of the + * non-ignorable trailing accent \u0325. + *+ * Option 2. will be the default mode for dealing with boundary accents unless + * specified via the API setCanonical(boolean). + * One restriction is to be noted for option 1. Currently there are no + * composite characters that consists of a character with combining class > 0 + * before a character with combining class == 0. However, if such a character + * exists in the future, the StringSearch may not work correctly with option 1 + * when such characters are encountered. + * + *+ * SearchIterator provides APIs to specify the starting position + * within the text string to be searched, e.g. setIndex, + * preceding and following. Since the starting position will + * be set as it is specified, please take note that there are some dangerous + * positions which the search may render incorrect results: + *
+ *
+ * + *- The midst of a substring that requires decomposition. + *
- If the following match is to be found, the position should not be the + * second character which requires to be swapped with the preceding + * character. Vice versa, if the preceding match is to be found, + * position to search from should not be the first character which + * requires to be swapped with the next character. E.g certain Thai and + * Lao characters require swapping. + *
- If a following pattern match is to be found, any position within a + * contracting sequence except the first will fail. Vice versa if a + * preceding pattern match is to be found, a invalid starting point + * would be any character within a contracting sequence except the last. + *
+ * Though collator attributes will be taken into consideration while + * performing matches, there are no APIs provided in StringSearch for setting + * and getting the attributes. These attributes can be set by getting the + * collator from getCollator and using the APIs in + * com.ibm.icu.text.Collator. To update StringSearch to the new + * collator attributes, reset() or + * setCollator(RuleBasedCollator) has to be called. + *
+ *+ * Consult the + * + * String Search user guide and the
* @see SearchIterator - * @see java.text.RuleBasedCollator - * - * @author Laura Werner - * @version 1.0 + * @see RuleBasedCollator + * @author Laura Werner, synwee + * @since 1.0 */ +// internal notes: all methods do not guarantee the correct status of the +// characteriterator. the caller has to maintain the original index position +// if necessary. methods could change the index position as it deems fit public final class StringSearch extends SearchIterator { + + // public constructors -------------------------------------------------- + /** - * Construct aSearchIterator
+ * documentation for more information and examples of use. + *StringSearch
object using a specific collator and set - * of boundary-detection rules. - *- * @param pat The text for which this object will search. - * - * @param target The text in which to search for the pattern. - * - * @param coll A
RuleBasedCollator
object which defines the - * language-sensitive comparison rules used to determine - * whether text in the pattern and target matches. - * - * @param breaker ABreakIterator
object used to constrain the matches - * that are found. Matches whose start and end indices - * in the target text are not boundaries as determined - * by theBreakIterator
are ignored. If this behavior - * is not desired,null
can be passed in instead. + * Initializes the iterator to use the language-specific rules defined in + * the argument collator to search for argument pattern in the argument + * target text. The argument breakiter is used to define logical matches. + * See super class documentation for more details on the use of the target + * text and BreakIterator. + * @param pattern text to look for. + * @param target target text to search for pattern. + * @param collator RuleBasedCollator that defines the language rules + * @param breaker A {@link BreakIterator} that is used to determine the + * boundaries of a logical match. This argument can be null. + * @exception IllegalArgumentException thrown when argument target is null, + * or of length 0 + * @see BreakIterator + * @see RuleBasedCollator + * @see SearchIterator */ - public StringSearch(String pat, CharacterIterator target, - RuleBasedCollator coll, BreakIterator breaker) { - super(target, breaker); - - pattern = pat; - collator = coll; - strength = coll.getStrength(); - iter = collator.getCollationElementIterator(target); + public StringSearch(String pattern, CharacterIterator target, + RuleBasedCollator collator, BreakIterator breakiter) + { + super(target, breakiter); + m_textBeginOffset_ = targetText.getBeginIndex(); + m_collator_ = collator; + m_colEIter_ = m_collator_.getCollationElementIterator(target); + m_utilColEIter_ = collator.getCollationElementIterator(""); + m_ceMask_ = getMask(m_collator_.getStrength()); + m_isCanonicalMatch_ = false; + m_pattern_ = new Pattern(pattern); + m_matchedIndex_ = DONE; - initialize(); // Initialize the Boyer-Moore tables + initialize(); } /** - * Construct aStringSearch
object using a specific collator. - *- * @param pattern The text for which this object will search. - * - * @param target The text in which to search for the pattern. - * - * @param collator A
RuleBasedCollator
object which defines the - * language-sensitive comparison rules used to determine - * whether text in the pattern and target matches. + * Initializes the iterator to use the language-specific rules defined in + * the argument collator to search for argument pattern in the argument + * target text. No BreakIterators are set to test for logical matches. + * @param pattern text to look for. + * @param target target text to search for pattern. + * @param collator RuleBasedCollator that defines the language rules + * @exception IllegalArgumentException thrown when argument target is null, + * or of length 0 + * @see RuleBasedCollator + * @see SearchIterator */ - public StringSearch(String pattern, - CharacterIterator target, - RuleBasedCollator collator) { + public StringSearch(String pattern, CharacterIterator target, + RuleBasedCollator collator) + { this(pattern, target, collator, BreakIterator.getCharacterInstance()); } /** - * Construct aStringSearch
object using the collator and - * character boundary detection rules for a given locale. - * @param pattern The text for which this object will search. - * - * @param target The text in which to search for the pattern. - * - * @param loc The locale whose collation and break-detection rules - * should be used. - * - * @exception ClassCastException thrown if the collator for the specified - * locale is not a RuleBasedCollator. + * Initializes the iterator to use the language-specific rules and + * break iterator rules defined in the argument locale to search for + * argument pattern in the argument target text. + * See super class documentation for more details on the use of the target + * text and BreakIterator. + * @param pattern text to look for. + * @param target target text to search for pattern. + * @param locale locale to use for language and break iterator rules + * @exception IllegalArgumentException thrown when argument target is null, + * or of length 0. ClassCastException thrown if the collator for + * the specified locale is not a RuleBasedCollator. + * @see BreakIterator + * @see RuleBasedCollator + * @see SearchIterator */ - public StringSearch(String pattern, CharacterIterator target, Locale loc) { - this(pattern, target, - (RuleBasedCollator) Collator.getInstance(loc), - BreakIterator.getCharacterInstance(loc)); + public StringSearch(String pattern, CharacterIterator target, Locale locale) + { + this(pattern, target, (RuleBasedCollator)Collator.getInstance(locale), + BreakIterator.getCharacterInstance(locale)); } /** - * Construct aStringSearch
object using the collator for the default - * locale. - * @param pattern The text for which this object will search. - * - * @param target The text in which to search for the pattern. - * - * @param collator ARuleBasedCollator
object which defines the - * language-sensitive comparison rules used to determine - * whether text in the pattern and target matches. + * Initializes the iterator to use the language-specific rules and + * break iterator rules defined in the default locale to search for + * argument pattern in the argument target text. + * See super class documentation for more details on the use of the target + * text and BreakIterator. + * @param pattern text to look for. + * @param target target text to search for pattern. + * @exception IllegalArgumentException thrown when argument target is null, + * or of length 0. ClassCastException thrown if the collator for + * the default locale is not a RuleBasedCollator. + * @see BreakIterator + * @see RuleBasedCollator + * @see SearchIterator */ - public StringSearch(String pattern, String target) { - this(pattern, - new StringCharacterIterator(target), + public StringSearch(String pattern, String target) + { + this(pattern, new StringCharacterIterator(target), (RuleBasedCollator)Collator.getInstance(), BreakIterator.getCharacterInstance()); } - //------------------------------------------------------------------- - // Getters and Setters - //------------------------------------------------------------------- + // public getters ----------------------------------------------------- /** - * Sets this object's strength property. The strength determines the - * minimum level of difference considered significant during a - * search. Generally, {@link Collator#TERTIARY} and - * {@link Collator#IDENTICAL} indicate that all differences are - * considered significant, {@link Collator#SECONDARY} indicates - * that upper/lower case distinctions should be ignored, and - * {@link Collator#PRIMARY} indicates that both case and accents - * should be ignored. However, the exact meanings of these constants - * are determined by individual Collator objects. 
- *- * @see java.text.Collator#PRIMARY - * @see java.text.Collator#SECONDARY - * @see java.text.Collator#TERTIARY - * @see java.text.Collator#IDENTICAL + * Returns the strength property of the RuleBasedCollator used in searching. + * See the RuleBasedCollator class documentation for a description of the + * strength property. + * @return the strength property of the RuleBasedCollator used in searching + * @see RuleBasedCollator + * @see #setStrength + * @see #getCollator + * @deprecated since release 2.2, user who would like to access the + * RuleBasedCollator strength, should retrieve the + * RuleBasedCollator via the API getCollator(), and use the + * Collator APIs to retrieve the strength. */ - public void setStrength(int newStrength) { - strength = newStrength; - + public int getStrength() { + return m_collator_.getStrength(); + } + + /** + *
+ * Gets the RuleBasedCollator used for the language rules. + *
+ *+ * Since StringSearch depends on the returned RuleBasedCollator, any + * changes to the RuleBasedCollator result should follow with a call to + * either StringSearch.reset() or + * StringSearch.setCollator(RuleBasedCollator) to ensure the correct + * search behaviour. + *
+ * @return RuleBasedCollator used by this StringSearch + * @see RuleBasedCollator + * @see #setCollator + */ + public RuleBasedCollator getCollator() + { + return m_collator_; + } + + /** + * Returns the pattern for which StringSearch is searching for. + * @return the pattern searched for + */ + public String getPattern() + { + return m_pattern_.targetText; + } + + /** + * Return the index in the target text where the iterator is currently + * positioned at. + * If the iteration has gone past the end of the target text or past + * the beginning for a backwards search, {@link #DONE} is returned. + * @return index in the target text where the iterator is currently + * positioned at + * @draft release 2.2 + */ + public int getIndex() + { + int result = m_colEIter_.getOffset(); + if (isOutOfBounds(m_textBeginOffset_, m_textLimitOffset_, result)) { + return DONE; + } + return result; + } + + /** + * Determines whether canonical matches (option 1, as described in the + * class documentation) is set. + * See setCanonical(boolean) for more information. + * @see #setCanonical + * @return true if canonical matches is set, false otherwise + * @draft release 2.2 + */ + public boolean isCanonical() + { + return m_isCanonicalMatch_; + } + + // public setters ----------------------------------------------------- + + /** + *+ * Sets the strength property of the RuleBasedCollator used for searching. + * See the Collator documentation for a description of the strengths. + *
+ * @deprecated since release 2.2, user who would like to modify the + * RuleBasedCollator, should retrieve the RuleBasedCollator + * via the API getCollator(), and use the Collator APIs to + * modify the strength. After which StringSearch.reset() + * or StringSearch.setCollator(RuleBasedCollator) should be + * called to update StringSearch. + * @see Collator + * @see Collator#PRIMARY + * @see Collator#SECONDARY + * @see Collator#TERTIARY + * @see Collator#QUATERNARY + * @see Collator#IDENTICAL + * @see #setCollator + * @see #getCollator + */ + public void setStrength(int newStrength) + { // Due to a bug (?) in CollationElementIterator, we must set the // collator's strength as well, since the iterator is going to // mask out the portions of the collation element that are not // relevant for the collator's current strength setting // Note that this makes it impossible to share a Collator among // multiple StringSearch objects if you adjust Strength settings. - collator.setStrength(strength); + m_collator_.setStrength(newStrength); initialize(); } - /** - * Returns this object's strength property, which indicates what level - * of differences are considered significant during a search. *- * @see #setStrength - */ - public int getStrength() { - return strength; - } - - /** - * Set the collator to be used for this string search. Also changes - * the search strength to match that of the new collator. + * Sets the RuleBasedCollator to be used for language-specific searching. + *
** This method causes internal data such as Boyer-Moore shift tables * to be recalculated, but the iterator's position is unchanged. - *
+ *
+ * @param collator to use for this StringSearch + * @exception IllegalArgumentException thrown when collator is null * @see #getCollator */ - public void setCollator(RuleBasedCollator coll) { - collator = coll; - strength = collator.getStrength(); - - // Also need to recompute the pattern and get a new target iterator - iter = collator.getCollationElementIterator(getTarget()); + public void setCollator(RuleBasedCollator collator) + { + if (collator == null) { + throw new IllegalArgumentException("Collator can not be null"); + } + m_collator_ = collator; + m_ceMask_ = getMask(m_collator_.getStrength()); + // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT initialize(); + m_colEIter_.setCollator(m_collator_); + m_utilColEIter_.setCollator(m_collator_); } /** - * Return the RuleBasedCollator being used for this string search. - */ - public RuleBasedCollator getCollator() { - return collator; - } - - /** - * Set the pattern for which to search. + *+ * Set the pattern to search for. + *
+ ** This method causes internal data such as Boyer-Moore shift tables * to be recalculated, but the iterator's position is unchanged. + *
+ * @param pattern for searching + * @see #getPattern + * @exception IllegalArgumentException thrown if pattern is null or of + * length 0 */ - public void setPattern(String pat) { - pattern = pat; + public void setPattern(String pattern) + { + if (pattern == null || pattern.length() <= 0) { + throw new IllegalArgumentException( + "Pattern to search for can not be null or of length 0"); + } + m_pattern_.targetText = pattern; initialize(); } /** - * Returns the pattern for which this object is searching. - */ - public String getPattern() { - return pattern; - } + * Set the target text to be searched. Text iteration will hence begin at + * the start of the text string. This method is useful if you want to + * re-use an iterator to search within a different body of text. + * @param text new text iterator to look for match, + * @exception IllegalArgumentException thrown when text is null or has + * 0 length + * @see #getTarget + * @draft release 2.2 + */ + public void setTarget(CharacterIterator text) + { + super.setTarget(text); + m_textBeginOffset_ = targetText.getBeginIndex(); + m_colEIter_.setText(targetText); + } /** - * Set the target text which should be searched and resets the - * iterator's position to point before the start of the new text. - * This method is useful if you want to re-use an iterator to - * search for the same pattern within a different body of text. - */ - public void setTarget(CharacterIterator target) { - super.setTarget(target); - - // Since we're caching a CollationElementIterator, recreate it - iter = collator.getCollationElementIterator(target); - } - - //------------------------------------------------------------------- - // Privates - //------------------------------------------------------------------- + *+ * Sets the position in the target text which the next search will start + * from to the argument. This method clears all previous states. + *
+ *+ * This method takes the argument position and sets the position in the + * target text accordingly, without checking if position is pointing to a + * valid starting point to begin searching. + *
+ *+ * Search positions that may render incorrect results are highlighted in + * the class documentation. + *
+ * @param position index to start next search from. + * @exception IndexOutOfBoundsException thrown if argument position is out + * of the target text range. + * @see #getIndex + * @draft release 2.2 + */ + public void setIndex(int position) + { + super.setIndex(position); + m_matchedIndex_ = DONE; + m_colEIter_.setExactOffset(position); + } + + /** + *+ * Set the canonical match mode. See class documentation for details. + * The default setting for this property is false. + *
+ * @param allowCanonical flag indicator if canonical matches are allowed + * @see #isCanonical + * @draft release 2.2 + */ + public void setCanonical(boolean allowCanonical) + { + m_isCanonicalMatch_ = allowCanonical; + if (m_isCanonicalMatch_ == true) { + if (m_canonicalPrefixAccents_ == null) { + m_canonicalPrefixAccents_ = new StringBuffer(); + } + else { + m_canonicalPrefixAccents_.delete(0, + m_canonicalPrefixAccents_.length()); + } + if (m_canonicalSuffixAccents_ == null) { + m_canonicalSuffixAccents_ = new StringBuffer(); + } + else { + m_canonicalSuffixAccents_.delete(0, + m_canonicalSuffixAccents_.length()); + } + } + } + + // public miscellaneous methods ----------------------------------------- + + /** + *+ * Resets the search iteration. All properties will be reset to the + * default value. + *
+ *+ * Search will begin at the start of the target text if a forward iteration + * is initiated before a backwards iteration. Otherwise if a + * backwards iteration is initiated before a forwards iteration, the search + * will begin at the end of the target text. + *
+ *+ * Canonical match option will be reset to false, ie an exact match. + *
+ * @draft release 2.2 + */ + public void reset() + { + // reset is setting the attributes that are already in string search, + // hence all attributes in the collator should be retrieved without any + // problems + super.reset(); + m_isCanonicalMatch_ = false; + m_ceMask_ = getMask(m_collator_.getStrength()); + // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT + initialize(); + m_colEIter_.setCollator(m_collator_); + m_colEIter_.reset(); + m_utilColEIter_.setCollator(m_collator_); + } + // protected methods ----------------------------------------------------- + /** - * Search forward for matching text, starting at a given location. - * Clients should not call this method directly; instead they should call - * {@link SearchIterator#next}. *- * If a match is found, this method returns the index at which the match - * starts and calls {@link SearchIterator#setMatchLength} - * with the number of characters in the target - * text that make up the match. If no match is found, the method returns - *
DONE
and does not call setMatchLength. - *- * @param start The index in the target text at which the search starts. - * - * @return The index at which the matched text in the target starts, or DONE - * if no match was found. - *
- * @see SearchIterator#next - * @see SearchIterator#DONE + * Concrete method to provide the mechanism + * for finding the next forwards match in the target text. + * See super class documentation for its use. + *
+ * @param start index in the target text at which the forwards search + * should begin. + * @return the starting index of the next forwards match if found, DONE + * otherwise + * @see #handlePrevious(int) + * @see #DONE */ protected int handleNext(int start) { - CharacterIterator target = getTarget(); - - int mask = getMask(strength); - int done = CollationElementIterator.NULLORDER & mask; - - if (DEBUG) { - debug("-------------------------handleNext-----------------------------------"); - debug(""); - debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) - + ", done=" + Integer.toString(done,16)); - debug("decomp=" + collator.getDecomposition()); - - debug("target.begin=" + getTarget().getBeginIndex()); - debug("target.end=" + getTarget().getEndIndex()); - debug("start = " + start); - } - - int index = start + minLen; - int matchEnd = 0; - - while (index <= target.getEndIndex()) - { - int patIndex = normLen; - int tval = 0, pval = 0; - boolean getP = true; - - iter.setOffset(index); - matchEnd = index; - - if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index + ", iter offset= " + iter.getOffset()); - - while ((patIndex > 0 || getP == false) && iter.getOffset() > start) - { - if (DEBUG) { - debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); - debug(" getP=" + getP); - } - - // Get the previous character in both the pattern and the target - tval = iter.previous() & mask; - - if (getP) pval = valueList[--patIndex]; - getP = true; - - if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); - - if (tval == 0) { // skip tval, use same pval - if (DEBUG) debug(" tval is ignorable"); - getP = false; - } - else if (pval != tval) { // Mismatch, skip ahead - if (DEBUG) debug(" mismatch: skippping " + getShift(tval, patIndex)); - - index += getShift(tval, patIndex); - break; - } - else if (patIndex == 0) { - // The values matched, and we're at the beginning of the pattern, - // 
which means we matched the whole thing. - start = iter.getOffset(); - setMatchLength(matchEnd - start); - if (DEBUG) debug("Found match at index "+ start ); - return start; - } + if (m_pattern_.m_CELength_ == 0) { + matchLength = 0; + if (m_matchedIndex_ == DONE && start == m_textBeginOffset_) { + m_matchedIndex_ = start; + return m_matchedIndex_; } - if (DEBUG) { - debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); - debug(" getP=" + getP); - } - if (index == matchEnd) { - // We hit the beginning of the text being searched, which is - // possible if it contains lots of ignorable characters. - // Advance one character and try again. - if (DEBUG) debug("hit beginning of target; advance by one"); - index++; - } - } - if (DEBUG) debug("Fell off end of outer loop; returning DONE"); - return DONE; + targetText.setIndex(start); + char ch = targetText.current(); + // ch can never be done, it is handled by next() + char ch2 = targetText.next(); + if (ch2 == CharacterIterator.DONE) { + m_matchedIndex_ = DONE; + } + else { + m_matchedIndex_ = targetText.getIndex(); + } + if (UTF16.isLeadSurrogate(ch) && UTF16.isTrailSurrogate(ch2)) { + targetText.next(); + m_matchedIndex_ = targetText.getIndex(); + } + } + else { + if (matchLength != 0) { + start += matchLength; + } + + // status checked below + if (m_isCanonicalMatch_) { + // can't use exact here since extra accents are allowed. + handleNextCanonical(start); + } + else { + handleNextExact(start); + } + } + targetText.setIndex(m_matchedIndex_); + return m_matchedIndex_; } - + /** - * Search backward for matching text ,starting at a given location. - * Clients should not call this method directly; instead they should call - *SearchIterator.previous()
, which this method overrides. - *- * If a match is found, this method returns the index at which the match - * starts and calls {@link SearchIterator#setMatchLength} - * with the number of characters in the target - * text that make up the match. If no match is found, the method returns - *
DONE
and does not call setMatchLength. - *- * @param start The index in the target text at which the search starts. - * - * @return The index at which the matched text in the target starts, or DONE - * if no match was found. - *
- * @see SearchIterator#previous - * @see SearchIterator#DONE - */ - protected int handlePrev(int start) + *
+ * Concrete method to provide the mechanism + * for finding the next backwards match in the target text. + * See super class documentation for its use. + *
+ * @param start index in the target text at which the backwards search + * should begin. + * @return the starting index of the next backwards match if found, DONE + * otherwise + * @see #handleNext(int) + * @see #DONE + */ + protected int handlePrevious(int start) { - int patLen = normLen; - int index = start - minLen; - - int mask = getMask(strength); - int done = CollationElementIterator.NULLORDER & mask; - - if (DEBUG) { - debug("-------------------------handlePrev-----------------------------------"); - debug(""); - debug("strength=" + strength + ", mask=" + Integer.toString(mask,16) - + ", done=" + Integer.toString(done,16)); - debug("decomp=" + collator.getDecomposition()); - - debug("target.begin=" + getTarget().getBeginIndex()); - debug("target.end=" + getTarget().getEndIndex()); - debug("start = " + start); - } - - while (index >= 0) { - int patIndex = 0; - int tval = 0, pval = 0; - boolean getP = true; - - iter.setOffset(index); - - if (DEBUG) debug(" outer loop: patIndex=" + patIndex + ", index=" + index); - - while ((patIndex < patLen || !getP) && iter.getOffset() < start) - { - if (DEBUG) { - debug(" inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); - debug(" getP=" + getP); - } - tval = iter.next() & mask; - if (getP) pval = valueList[patIndex++]; - getP = true; - - if (DEBUG) debug(" pval=" + Integer.toString(pval,16) + ", tval=" + Integer.toString(tval,16)); - - if (tval == done) { - if (DEBUG) debug(" end of target; no match"); - return DONE; - } - else if (tval == 0) { - if (DEBUG) debug(" tval is ignorable"); - getP = false; - } - else if (pval != tval) { - // We didn't match this pattern. Skip ahead - if (DEBUG) debug(" mismatch: skippping " + getBackShift(tval, patIndex)); - - int shift = getBackShift(tval, patIndex); - index -= shift; - break; - } - else if (patIndex == patLen) { - // The elements matched and we're at the end of the pattern, - // which means we matched the whole thing. 
- setMatchLength(iter.getOffset() - index); - if (DEBUG) debug("Found match at index "+ start ); - return index; - } + if (m_pattern_.m_CELength_ == 0) { + matchLength = 0; + // start can never be DONE or 0, it is handled in previous + targetText.setIndex(start); + char ch = targetText.previous(); + if (ch == CharacterIterator.DONE) { + m_matchedIndex_ = DONE; } - if (DEBUG) { - debug(" end of inner loop: patIndex=" + patIndex + " iter=" + iter.getOffset()); - debug(" getP=" + getP); - } - if (iter.getOffset() >= start) { - // We hit the end of the text being searched, which is - // possible if it contains lots of ignorable characters. - // Back up one character and try again. - if (DEBUG) debug("hit end of target; back by one"); - index--; + else { + m_matchedIndex_ = targetText.getIndex(); + if (UTF16.isTrailSurrogate(ch)) { + if (UTF16.isLeadSurrogate(targetText.previous())) { + m_matchedIndex_ = targetText.getIndex(); + } + } + } + } + else { + if (m_isCanonicalMatch_) { + // can't use exact here since extra accents are allowed. + handlePreviousCanonical(start); + } + else { + handlePreviousExact(start); } } - if (DEBUG) debug("Fell off end of outer loop; returning DONE"); - return DONE; + + targetText.setIndex(m_matchedIndex_); + return m_matchedIndex_; } - /** - * Return a bitmask that will select only the portions of a collation - * element that are significant at the given strength level. 
- */ - private static final int getMask(int strength) { - switch (strength) { - case Collator.PRIMARY: - return 0xFFFF0000; - case Collator.SECONDARY: - return 0xFFFFFF00; - default: - return 0xFFFFFFFF; - } - } + // private static inner classes ---------------------------------------- - - //------------------------------------------------------------------------ - // Private Data - // - private CollationElementIterator iter; - private RuleBasedCollator collator; - private int strength; - - //------------------------------------------------------------------------ - // Everything from here on down is the data used to represent the - // Boyer-Moore shift tables and the code that generates and manipulates - // them. - // - private static final int MAX_TABLE = 256; // Size of the shift tables - - private int valueList[] = null; - private int shiftTable[] = new int[MAX_TABLE]; - private int backShiftTable[] = new int[MAX_TABLE]; - - private String pattern; // The pattern string - private int normLen = 0; // num. of collation elements in pattern. 
- private int minLen = 0; // Min of composed, decomposed versions - private int maxLen = 0; // Max - - private void initialize() { - if (DEBUG) { - debug("-------------------------initialize-----------------------------------"); - debug("pattern=" + pattern); - } + private static class Pattern + { + // protected methods ----------------------------------------------- + + /** + * Pattern string + */ + protected String targetText; + /** + * Array containing the collation elements of targetText + */ + protected int m_CE_[]; + /** + * Number of collation elements in m_CE_ + */ + protected int m_CELength_; + /** + * Flag indicator if targetText starts with an accent + */ + protected boolean m_hasPrefixAccents_; + /** + * Flag indicator if targetText ends with an accent + */ + protected boolean m_hasSuffixAccents_; + /** + * Default number of characters to shift for Boyer Moore + */ + protected int m_defaultShiftSize_; + /** + * Number of characters to shift for Boyer Moore, depending on the + * source text to search + */ + protected char m_shift_[]; + /** + * Number of characters to shift backwards for Boyer Moore, depending + * on the source text to search + */ + protected char m_backShift_[]; - CollationElementIterator iter = collator.getCollationElementIterator(pattern); - - int mask = getMask(strength); - - // See how many non-ignorable collation keys are in the text - normLen = 0; - int elem; - while ((elem = iter.next()) != CollationElementIterator.NULLORDER) + // protected constructors ------------------------------------------ + + /** + * Empty constructor + */ + protected Pattern(String pattern) { - if ((elem & mask) != 0) { - normLen++; - } + targetText = pattern; + m_CE_ = new int[INITIAL_ARRAY_SIZE_]; + m_CELength_ = 0; + m_hasPrefixAccents_ = false; + m_hasSuffixAccents_ = false; + m_defaultShiftSize_ = 1; + m_shift_ = new char[MAX_TABLE_SIZE_]; + m_backShift_ = new char[MAX_TABLE_SIZE_]; } + }; - // Save them all - valueList = new int[normLen]; - int 
expandLen = 0; - iter.reset(); - - for (int i = 0; i < normLen; i++) - { - elem = iter.next(); - - if ((elem & mask) != 0) { - valueList[i] = elem & mask; - - } - // Keep track of whether there are any expanding-character - // sequences that can result in one of the characters that's in - // the pattern. If there are, we have to reduce the shift - // distances calculated below to account for it. - expandLen += iter.getMaxExpansion(elem) - 1; - } - - // - // We need to remember the size of the composed and decomposed - // versions of the string. Standard Boyer-Moore shift calculations - // can be wrong by an amount up to that difference, since a small - // small number of characters in the pattern can map to a larger - // number in the text being searched, or vice-versa. - // - int uniLen = pattern.length(); - maxLen = Math.max(normLen, uniLen); - minLen = Math.min(normLen, uniLen) - expandLen; - - if (DEBUG) debug("normLen=" + normLen + ", expandLen=" + expandLen - + ", maxLen=" + maxLen + ", minLen=" + minLen); - - // Now initialize the shift tables - // - // NOTE: This is the most conservative way to build them. If we had a way - // of knowing that there were no expanding/contracting chars in the rules, - // we could get rid of the "- 1" in the shiftTable calculations. - // But all of the default collators have at least one expansion or - // contraction, so it probably doesn't matter anyway. 
- // - for (int i = 0; i < MAX_TABLE; i++) { - shiftTable[i] = backShiftTable[i] = minLen; - } - - for (int i = 0; i < normLen-1; i++) { - shiftTable[hash(valueList[i])] = Math.max(minLen - i - 1, 1); - } - shiftTable[hash(valueList[normLen-1])] = 1; - - for (int i = normLen - 1; i > 0; i--) { - backShiftTable[hash(valueList[i])] = i; - } - backShiftTable[hash(valueList[0])] = 1; - - if (DEBUG) dumpTables(); - } + // private data members ------------------------------------------------ + /** - * Method used by StringSearch to determine how far to the right to - * shift the pattern during a Boyer-Moore search. - * - * @param curValue The current value in the target text - * @param curIndex The index in the pattern at which we failed to match - * curValue in the target text. + * target text begin offset. Each targetText has a valid contiguous region + * to iterate and this data member is the offset to the first such + * character in the region. */ - private int getShift( int curValue, int curIndex ) { - int shiftAmt = shiftTable[hash(curValue)]; - - // if (minLen != maxLen) { - int adjust = normLen - curIndex; - // if (shiftAmt > adjust + 1) { - if (adjust > 1 && shiftAmt >= adjust) { - if (DEBUG) debug("getShift: adjusting by " + adjust); - // shiftAmt -= adjust; - shiftAmt -= adjust - 1; - } - // } - return shiftAmt; - } - + private int m_textBeginOffset_; /** - * Method used by StringSearch to determine how far to the left to - * shift the pattern during a reverse Boyer-Moore search. - * - * @param curValue The current value in the target text - * @param curIndex The index in the pattern at which we failed to match - * curValue in the target text. + * target text limit offset. Each targetText has a valid contiguous region + * to iterate and this data member is the offset to 1 after the last such + * character in the region. 
*/ - private int getBackShift( int curValue, int curIndex ) { - int shiftAmt = backShiftTable[hash(curValue)]; + private int m_textLimitOffset_; + /** + * Upon completion of a search, m_matchIndex_ will store starting offset in + * m_text for the match. The Value DONE is the default value. + * If we are not at the start of the text or the end of the text and + * m_matchedIndex_ is DONE it means that we can find any more matches in + * that particular direction + */ + private int m_matchedIndex_; + /** + * Current pattern to search for + */ + private Pattern m_pattern_; + /** + * Collator whose rules are used to perform the search + */ + private RuleBasedCollator m_collator_; + /** + * The collation element iterator for the text source. + */ + private CollationElementIterator m_colEIter_; + /** + * Utility collation element, used throughout program for temporary + * iteration. + */ + private CollationElementIterator m_utilColEIter_; + /** + * The mask used on the collation elements to retrieve the valid strength + * weight + */ + private int m_ceMask_; + /** + * Buffer storing accents during a canonical search + */ + private StringBuffer m_canonicalPrefixAccents_; + /** + * Buffer storing accents during a canonical search + */ + private StringBuffer m_canonicalSuffixAccents_; + /** + * Flag to indicate if canonical search is to be done. + * E.g looking for "a\u0300" in "a\u0318\u0300" will yield the match at 0. 
+ */ + private boolean m_isCanonicalMatch_; + /** + * Size of the shift tables + */ + private static final int MAX_TABLE_SIZE_ = 257; + /** + * Initial array size + */ + private static final int INITIAL_ARRAY_SIZE_ = 256; + /** + * Utility mask + */ + private static final int SECOND_LAST_BYTE_SHIFT_ = 8; + /** + * Utility mask + */ + private static final int LAST_BYTE_MASK_ = 0xff; + /** + * Utility buffer for return values and temporary storage + */ + private int m_utilBuffer_[] = new int[2]; - // if (minLen != maxLen) { - int adjust = curIndex; - // int adjust = normLen - (minLen - curIndex); - if (adjust > 1 && shiftAmt > adjust) { - shiftAmt -= adjust - 1; - } - /* - if (shiftAmt > adjust + 1) { - if (DEBUG) debug("getBackShift: adjusting by " + adjust); - shiftAmt -= adjust; - } - */ - // } - return shiftAmt; - } + // private methods ------------------------------------------------------- /** * Hash a collation element from its full size (32 bits) down into a * value that can be used as an index into the shift tables. Right * now we do a modulus by the size of the hash table. - * - * TODO: At some point I should experiment to see whether a slightly - * more complicated hash function gives us a better distribution - * on multilingual text. I doubt it will have much effect on - * performance, though. + * @param ce collation element + * @return collapsed version of the collation element */ - private static final int hash(int order) { - return CollationElementIterator.primaryOrder(order) % MAX_TABLE; + private static final int hash(int ce) + { + // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work + // well with the new collation where most of the latin 1 characters + // are of the value xx000xxx. their hashes will most of the time be 0 + // to be discussed on the hash algo. 
+ return CollationElementIterator.primaryOrder(ce) % MAX_TABLE_SIZE_; } - - - //------------------------------------------------------------------------- - // Debugging support... - //------------------------------------------------------------------------- - - static private final boolean DEBUG = false; - - static void debug(String str) { - System.out.println(str); - } - - void dumpTables() { - for (int i = 0; i < MAX_TABLE; i++) { - if (shiftTable[i] != minLen) { - debug("shift[" + Integer.toString(i,16) + "] = " + shiftTable[i]); + + /** + * Gets the fcd value for a character at the argument index. + * This method takes into accounts of the supplementary characters. + * Note this method changes the offset in the character iterator. + * @param str UTF16 string where character for fcd retrieval resides + * @param offset position of the character whose fcd is to be retrieved + * @return fcd value + */ + private static final char getFCD(CharacterIterator str, int offset) + { + str.setIndex(offset); + char ch = str.current(); + char result = NormalizerImpl.getFCD16(ch); + + if ((result != 0) && (str.getEndIndex() != offset + 1) && + UTF16.isLeadSurrogate(ch)) { + ch = str.next(); + if (UTF16.isTrailSurrogate(ch)) { + result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch); + } else { + result = 0; + } + } + return result; + } + + /** + * Gets the fcd value for a character at the argument index. + * This method takes into accounts of the supplementary characters. 
+ * @param str UTF16 string where character for fcd retrieval resides + * @param offset position of the character whose fcd is to be retrieved + * @return fcd value + */ + private static final char getFCD(String str, int offset) + { + char ch = str.charAt(offset); + char result = NormalizerImpl.getFCD16(ch); + + if ((result != 0) && (str.length() != offset + 1) && + UTF16.isLeadSurrogate(ch)) { + ch = str.charAt(offset + 1); + if (UTF16.isTrailSurrogate(ch)) { + result = NormalizerImpl.getFCD16FromSurrogatePair(result, ch); + } else { + result = 0; + } + } + return result; + } + + /** + * Getting the modified collation elements taking into account the collation + * attributes + * @param ce + * @return the modified collation element + */ + private final int getCE(int ce) + { + // note for tertiary we can't use the collator->tertiaryMask, that + // is a preprocessed mask that takes into account case options. since + // we are only concerned with exact matches, we don't need that. + ce &= m_ceMask_; + + if (m_collator_.isAlternateHandling(true)) { + // alternate handling here, since only the 16 most significant + // digits is only used, we can safely do a compare without masking + // if the ce is a variable, we mask and get only the primary values + // no shifting to quartenary is required since all primary values + // less than variabletop will need to be masked off anyway. + if ((m_collator_.m_variableTopValue_ << 16) > ce) { + if (m_collator_.getStrength() == Collator.QUATERNARY) { + ce = CollationElementIterator.primaryOrder(ce); + } + else { + ce = CollationElementIterator.IGNORABLE; + } + } + } + + return ce; + } + + /** + * Appends a int to a int array, increasing the size of the array when + * we are out of space. 
+ * @param offset in array to append to + * @param value to append + * @param array to append to + * @return the array appended to, this could be a new and bigger array + */ + private static final int[] append(int offset, int value, int array[]) + { + if (offset >= array.length) { + int temp[] = new int[offset + INITIAL_ARRAY_SIZE_]; + System.arraycopy(array, 0, temp, 0, array.length); + array = temp; + } + array[offset] = value; + return array; + } + + /** + * Initializing the ce table for a pattern. Stores non-ignorable collation + * keys. Table size will be estimated by the size of the pattern text. + * Table expansion will be perform as we go along. Adding 1 to ensure that + * the table size definitely increases. + * Internal method, status assumed to be a success. + * @return total number of expansions + */ + private final int initializePatternCETable() + { + m_utilColEIter_.setText(m_pattern_.targetText); + + int offset = 0; + int result = 0; + int ce = m_utilColEIter_.next(); + + while (ce != CollationElementIterator.NULLORDER) { + int newce = getCE(ce); + if (newce != CollationElementIterator.IGNORABLE) { + m_pattern_.m_CE_ = append(offset, newce, m_pattern_.m_CE_); + offset ++; + } + result += m_utilColEIter_.getMaxExpansion(ce) - 1; + ce = m_utilColEIter_.next(); + } + + m_pattern_.m_CE_ = append(offset, 0, m_pattern_.m_CE_); + m_pattern_.m_CELength_ = offset; + + return result; + } + + /** + * Initializes the pattern struct. + * Internal method, status assumed to be success. + * @return expansionsize the total expansion size of the pattern + */ + private final int initializePattern() + { + m_pattern_.m_hasPrefixAccents_ = (getFCD(m_pattern_.targetText, 0) + >> SECOND_LAST_BYTE_SHIFT_) != 0; + m_pattern_.m_hasSuffixAccents_ = (getFCD(m_pattern_.targetText, + m_pattern_.targetText.length() + - 1) + & LAST_BYTE_MASK_) != 0; + // since intializePattern is an internal method status is a success. 
+ return initializePatternCETable(); + } + + /** + * Initializing shift tables, with the default values. + * If a corresponding default value is 0, the shift table is not set. + * @param shift table for forwards shift + * @param backshift table for backwards shift + * @param cetable table containing pattern ce + * @param cesize size of the pattern ces + * @param expansionsize total size of the expansions + * @param defaultforward the default forward value + * @param defaultbackward the default backward value + */ + private final void setShiftTable(char shift[], + char backshift[], + int cetable[], int cesize, + int expansionsize, + char defaultforward, + char defaultbackward) + { + // estimate the value to shift. to do that we estimate the smallest + // number of characters to give the relevant ces, ie approximately + // the number of ces minus their expansion, since expansions can come + // from a character. + for (int count = 0; count < MAX_TABLE_SIZE_; count ++) { + shift[count] = defaultforward; + } + cesize --; // down to the last index + for (int count = 0; count < cesize; count ++) { + // number of ces from right of array to the count + int temp = defaultforward - count - 1; + shift[hash(cetable[count])] = temp > 1 ? ((char)temp) : 1; + } + shift[hash(cetable[cesize])] = 1; + // for ignorables we just shift by one. see test examples. + shift[hash(0)] = 1; + + for (int count = 0; count < MAX_TABLE_SIZE_; count ++) { + backshift[count] = defaultbackward; + } + for (int count = cesize; count > 0; count --) { + // the original value count does not seem to work + backshift[hash(cetable[count])] = (char)(count > expansionsize ? + count - expansionsize : 1); + } + backshift[hash(cetable[0])] = 1; + backshift[hash(0)] = 1; + } + + /** + *Building of the pattern collation element list and the Boyer Moore + * StringSearch table.
+ *The canonical match will only be performed after the default match + * fails.
+ *For both cases we need to remember the size of the composed and + * decomposed versions of the string. Since the Boyer-Moore shift + * calculations shifts by a number of characters in the text and tries to + * match the pattern from that offset, the shift value can not be too large + * in case we miss some characters. To choose a right shift size, we + * estimate the NFC form of the and use its size as a shift guide. The NFC + * form should be the small possible representation of the pattern. Anyways, + * we'll err on the smaller shift size. Hence the calculation for + * minlength. Canonical match will be performed slightly differently. We'll + * split the pattern into 3 parts, the prefix accents (PA), the middle + * string bounded by the first and last base character (MS), the ending + * accents (EA). Matches will be done on MS first, and only when we match + * MS then some processing will be required for the prefix and end accents + * in order to determine if they match PA and EA. Hence the default shift + * values for the canonical match will take the size of either end's accent + * into consideration. Forwards search will take the end accents into + * consideration for the default shift values and the backwards search will + * take the prefix accents into consideration.
+ *If pattern has no non-ignorable ce, we return a illegal argument + * error.
+ */ + private final void initialize() + { + int expandlength = initializePattern(); + if (m_pattern_.m_CELength_ > 0) { + char minlength = (char)(m_pattern_.m_CELength_ > expandlength + ? m_pattern_.m_CELength_ - expandlength : 1); + m_pattern_.m_defaultShiftSize_ = minlength; + setShiftTable(m_pattern_.m_shift_, m_pattern_.m_backShift_, + m_pattern_.m_CE_, m_pattern_.m_CELength_, + expandlength, minlength, minlength); + } + else { + m_pattern_.m_defaultShiftSize_ = 0; + } + } + + /** + * Determine whether the search text bounded by the offset start and end is + * one or more whole units of text as determined by the breakiterator in + * StringSearch. + * @param start target text start offset + * @param end target text end offset + */ + private final boolean isBreakUnit(int start, int end) + { + if (breakIterator != null) { + int startindex = breakIterator.first(); + int endindex = breakIterator.last(); + + // out-of-range indexes are never boundary positions + if (start < startindex || start > endindex || end < startindex + || end > endindex) { + return false; + } + // otherwise, we can use following() on the position before the + // specified one and return true of the position we get back is the + // one the user specified + boolean result = (start == startindex + || breakIterator.following(start - 1) == start) + && (end == endindex + || breakIterator.following(end - 1) == end); + if (result) { + // iterates the individual ces + m_utilColEIter_.setText(targetText); + m_utilColEIter_.setExactOffset(start); + for (int count = 0; count < m_pattern_.m_CELength_; + count ++) { + if (getCE(m_utilColEIter_.next()) + != m_pattern_.m_CE_[count]) { + return false; + } + } + if (m_utilColEIter_.next() + != CollationElementIterator.NULLORDER + && m_utilColEIter_.getOffset() == end) { + // extra collation elements at the end of the match + return false; + } + } + return result; + } + return true; + } + + /** + * Getting the next base character offset if current offset is an 
accent, + * or the current offset if the current character contains a base character. + * accents the following base character will be returned + * @param text string + * @param textoffset current offset + * @param textlength length of text string + * @return the next base character or the current offset + * if the current character is contains a base character. + */ + private final int getNextBaseOffset(CharacterIterator text, + int textoffset) + { + if (textoffset < text.getEndIndex()) { + while (text.getIndex() < text.getEndIndex()) { + int result = textoffset; + if ((getFCD(text, textoffset ++) + >> SECOND_LAST_BYTE_SHIFT_) == 0) { + return result; + } + } + return text.getEndIndex(); + } + return textoffset; + } + + /** + * Gets the next base character offset depending on the string search + * pattern data + * @param textoffset one offset away from the last character + * to search for. + * @return start index of the next base character or the current offset + * if the current character is contains a base character. + */ + private final int getNextBaseOffset(int textoffset) + { + if (m_pattern_.m_hasSuffixAccents_ + && textoffset < m_textLimitOffset_) { + targetText.setIndex(textoffset); + targetText.previous(); + if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) { + return getNextBaseOffset(targetText, textoffset); } - } - for (int i = 0; i < MAX_TABLE; i++) { - if (backShiftTable[i] != minLen) { - debug("backShift[" + Integer.toString(i,16) + "] = " + backShiftTable[i]); - } - } + } + return textoffset; + } + + /** + * Shifting the collation element iterator position forward to prepare for + * a following match. If the last character is a unsafe character, we'll + * only shift by 1 to capture contractions, normalization etc. + * Internal method, status assumed to be success. + * @param textoffset start text position to do search + * @param ce the text ce which failed the match. 
+ * @param patternceindex index of the ce within the pattern ce buffer which + * failed the match + * @return final offset + */ + private int shiftForward(int textoffset, int ce, int patternceindex) + + { + if (isOverlapping()) { + if (textoffset > m_textBeginOffset_) { + textoffset ++; + } + else { + textoffset = m_pattern_.m_defaultShiftSize_; + } + } + else { + if (ce != CollationElementIterator.NULLORDER) { + int shift = m_pattern_.m_shift_[hash(ce)]; + // this is to adjust for characters in the middle of the + // substring for matching that failed. + int adjust = m_pattern_.m_CELength_ - patternceindex; + if (adjust > 1 && shift >= adjust) { + shift -= adjust - 1; + } + textoffset += shift; + } + else { + textoffset += m_pattern_.m_defaultShiftSize_; + } + } + + textoffset = getNextBaseOffset(textoffset); + // check for unsafe characters + // * if it is the start or middle of a contraction: to be done after + // a initial match is found + // * thai or lao base consonant character: similar to contraction + // * high surrogate character: similar to contraction + // * next character is a accent: shift to the next base character + return textoffset; + } + + /** + * Gets the offset to the next safe point in text. + * ie. not the middle of a contraction, swappable characters or + * supplementary characters. + * @param textoffset offset in string + * @param end offset in string + * @return offset to the next safe character + */ + private final int getNextSafeOffset(int textoffset, int end) + { + int result = textoffset; // first contraction character + targetText.setIndex(result); + while (result != end && + m_collator_.isUnsafe(targetText.current())) { + result ++; + targetText.setIndex(result); + } + return result; + } + + /** + * This checks for accents in the potential match started with a composite + * character. + * This is really painful... we have to check that composite character do + * not have any extra accents. 
We have to normalize the potential match and + * find the immediate decomposed character before the match. + * The first composite character would have been taken care of by the fcd + * checks in checkForwardExactMatch. + * This is the slow path after the fcd of the first character and + * the last character has been checked by checkForwardExactMatch and we + * determine that the potential match has extra non-ignorable preceding + * ces. + * E.g. looking for \u0301 acute in \u01FA A ring above and acute, + * checkExtraMatchAccent should fail since there is a middle ring in + * \u01FA Note here that accents checking are slow and cautioned in the API + * docs. + * Internal method, status assumed to be a success, caller should check + * status before calling this method + * @param start index of the potential unfriendly composite character + * @param end index of the potential unfriendly composite character + * @return true if there is non-ignorable accents before at the beginning + * of the match, false otherwise. 
+ */ + private final boolean checkExtraMatchAccents(int start, int end) + { + boolean result = false; + if (m_pattern_.m_hasPrefixAccents_) { + targetText.setIndex(start); + + if (UTF16.isLeadSurrogate(targetText.next())) { + if (!UTF16.isTrailSurrogate(targetText.next())) { + targetText.previous(); + } + } + // we are only concerned with the first composite character + String str = getString(targetText, start, end); + if (Normalizer.quickCheck(str, Normalizer.NFD) + == Normalizer.NO) { + int safeoffset = getNextSafeOffset(start, end); + if (safeoffset != end) { + safeoffset ++; + } + String decomp = Normalizer.decompose( + str.substring(0, safeoffset - start), false); + m_utilColEIter_.setText(decomp); + int firstce = m_pattern_.m_CE_[0]; + boolean ignorable = true; + int ce = CollationElementIterator.IGNORABLE; + int offset = 0; + while (ce != firstce) { + offset = m_utilColEIter_.getOffset(); + if (ce != firstce + && ce != CollationElementIterator.IGNORABLE) { + ignorable = false; + } + ce = m_utilColEIter_.next(); + } + m_utilColEIter_.setExactOffset(offset); // back up 1 to the + m_utilColEIter_.previous(); // right offset + offset = m_utilColEIter_.getOffset(); + result = !ignorable && (UCharacter.getCombiningClass( + UTF16.charAt(decomp, offset)) != 0); + } + } + + return result; + } + + /** + * Used by exact matches, checks if there are accents before the match. + * This is really painful... we have to check that composite characters at + * the start of the matches have to not have any extra accents. + * We check the FCD of the character first, if it starts with an accent and + * the first pattern ce does not match the first ce of the character, we + * bail. + * Otherwise we try normalizing the first composite + * character and find the immediate decomposed character before the match to + * see if it is an non-ignorable accent. 
+ * Now normalizing the first composite character is enough because we ensure + * that when the match is passed in here with extra beginning ces, the + * first or last ce that match has to occur within the first character. + * E.g. looking for \u0301 acute in \u01FA A ring above and acute, + * checkExtraMatchAccent should fail since there is a middle ring in \u01FA + * Note here that accents checking are slow and cautioned in the API docs. + * @param start offset + * @param end offset + * @return true if there are accents on either side of the match, + * false otherwise + */ + private final boolean hasAccentsBeforeMatch(int start, int end) + { + if (m_pattern_.m_hasPrefixAccents_) { + // we have been iterating forwards previously + boolean ignorable = true; + int firstce = m_pattern_.m_CE_[0]; + m_colEIter_.setExactOffset(start); + int ce = getCE(m_colEIter_.next()); + while (ce != firstce) { + if (ce != CollationElementIterator.IGNORABLE) { + ignorable = false; + } + ce = getCE(m_colEIter_.next()); + } + if (!ignorable && m_colEIter_.isInBuffer()) { + // within normalization buffer, discontiguous handled here + return true; + } + + // within text + boolean accent = (getFCD(targetText, start) >> SECOND_LAST_BYTE_SHIFT_) + != 0; + if (!accent) { + return checkExtraMatchAccents(start, end); + } + if (!ignorable) { + return true; + } + if (start > m_textBeginOffset_) { + targetText.setIndex(start); + targetText.previous(); + if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) + != 0) { + m_colEIter_.setExactOffset(start); + ce = m_colEIter_.previous(); + if (ce != CollationElementIterator.NULLORDER + && ce != CollationElementIterator.IGNORABLE) { + return true; + } + } + } + } + + return false; + } + + /** + * Used by exact matches, checks if there are accents bounding the match. + * Note this is the initial boundary check. If the potential match + * starts or ends with composite characters, the accents in those + * characters will be determined later. 
+ * Not doing backwards iteration here, since discontiguos contraction for + * backwards collation element iterator, use up too many characters. + * E.g. looking for \u030A ring in \u01FA A ring above and acute, + * should fail since there is a acute at the end of \u01FA + * Note here that accents checking are slow and cautioned in the API docs. + * @param start offset of match + * @param end end offset of the match + * @return true if there are accents on either side of the match, + * false otherwise + */ + private final boolean hasAccentsAfterMatch(int start, int end) + { + if (m_pattern_.m_hasSuffixAccents_) { + targetText.setIndex(end); + if (end > m_textBeginOffset_ + && UTF16.isTrailSurrogate(targetText.previous())) { + if (targetText.getIndex() > m_textBeginOffset_ && + !UTF16.isLeadSurrogate(targetText.previous())) { + targetText.next(); + } + } + if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) != 0) { + int firstce = m_pattern_.m_CE_[0]; + m_colEIter_.setExactOffset(start); + while (getCE(m_colEIter_.next()) != firstce) { + } + int count = 1; + while (count < m_pattern_.m_CELength_) { + if (getCE(m_colEIter_.next()) + == CollationElementIterator.IGNORABLE) { + count --; + } + count ++; + } + int ce = getCE(m_colEIter_.next()); + if (ce != CollationElementIterator.NULLORDER + && ce != CollationElementIterator.IGNORABLE) { + if (m_colEIter_.getOffset() <= end) { + return true; + } + if ((getFCD(targetText, end) >> SECOND_LAST_BYTE_SHIFT_) + != 0) { + return true; + } + } + } + } + return false; + } + + /** + * Checks if the offset runs out of the text string range + * @param textstart offset of the first character in the range + * @param textlimit limit offset of the text string range + * @param offset to test + * @return true if offset is out of bounds, false otherwise + */ + private static final boolean isOutOfBounds(int textstart, int textlimit, + int offset) + { + return offset < textstart || offset > textlimit; + } + + /** + * Checks 
for identical match + * @param strsrch string search data + * @param start offset of possible match + * @param end offset of possible match + * @return true if identical match is found + */ + private final boolean checkIdentical(int start, int end) + { + if (m_collator_.getStrength() != Collator.IDENTICAL) { + return true; + } + + String textstr = getString(targetText, start, end - start); + if (Normalizer.quickCheck(textstr, Normalizer.NFD) + == Normalizer.NO) { + textstr = Normalizer.decompose(textstr, false); + } + String patternstr = m_pattern_.targetText; + if (Normalizer.quickCheck(patternstr, Normalizer.NFD) + == Normalizer.NO) { + patternstr = Normalizer.decompose(patternstr, false); + } + return textstr.equals(patternstr); + } + + /** + * Checks to see if the match is repeated + * @param start new match start index + * @param end new match end index + * @return true if the the match is repeated, false otherwise + */ + private final boolean checkRepeatedMatch(int start, int end) + { + if (m_matchedIndex_ == DONE) { + return false; + } + int lastmatchlimit = m_matchedIndex_ + matchLength; + if (!isOverlapping()) { + return (start >= m_matchedIndex_ && start <= lastmatchlimit) + || (end >= m_matchedIndex_ && end <= lastmatchlimit); + + } + return start == m_matchedIndex_; + } + + /** + * Checks match for contraction. + * If the match ends with a partial contraction we fail. + * If the match starts too far off (because of backwards iteration) we try + * to chip off the extra characters depending on whether a breakiterator + * has been used. + * Temporary utility buffer used to return modified start and end. + * @param start offset of potential match, to be modified if necessary + * @param end offset of potential match, to be modified if necessary + * @return true if match passes the contraction test, false otherwise. 
+ */ + private final boolean checkNextExactContractionMatch(int start, int end) + { + // This part checks if either ends of the match contains potential + // contraction. If so we'll have to iterate through them + char endchar = 0; + if (end < m_textLimitOffset_) { + targetText.setIndex(end); + endchar = targetText.current(); + } + char poststartchar = 0; + if (start + 1 < m_textLimitOffset_) { + targetText.setIndex(start + 1); + poststartchar = targetText.current(); + } + if (m_collator_.isUnsafe(endchar) + || m_collator_.isUnsafe(poststartchar)) { + // expansion prefix, what's left to iterate + int bufferedCEOffset = m_colEIter_.m_CEBufferOffset_; + boolean hasBufferedCE = bufferedCEOffset > 0; + m_colEIter_.setExactOffset(start); + int temp = start; + while (bufferedCEOffset > 0) { + // getting rid of the redundant ce, caused by setOffset. + // since backward contraction/expansion may have extra ces if + // we are in the normalization buffer, hasAccentsBeforeMatch + // would have taken care of it. + // E.g. the character \u01FA will have an expansion of 3, but + // if we are only looking for acute and ring \u030A and \u0301, + // we'll have to skip the first ce in the expansion buffer. + m_colEIter_.next(); + if (m_colEIter_.getOffset() != temp) { + start = temp; + temp = m_colEIter_.getOffset(); + } + bufferedCEOffset --; + } + + int count = 0; + while (count < m_pattern_.m_CELength_) { + int ce = getCE(m_colEIter_.next()); + if (ce == CollationElementIterator.IGNORABLE) { + continue; + } + if (hasBufferedCE && count == 0 + && m_colEIter_.getOffset() != temp) { + start = temp; + temp = m_colEIter_.getOffset(); + } + if (ce != m_pattern_.m_CE_[count]) { + end ++; + end = getNextBaseOffset(end); + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return false; + } + count ++; + } + } + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return true; + } + + + /** + * Checks and sets the match information if found. + * Checks + *+ *
+ * Otherwise the offset will be shifted to the next character. + * The result m_matchIndex_ and m_matchLength_ will be set to the truncated + * more fitting result value. + * Uses the temporary utility buffer for storing the modified textoffset. + * @param textoffset offset in the collation element text. + * @return true if the match is valid, false otherwise + */ + private final boolean checkNextExactMatch(int textoffset) + { + int start = m_colEIter_.getOffset(); + if (!checkNextExactContractionMatch(start, textoffset)) { + // returns the modified textoffset + m_utilBuffer_[0] = m_utilBuffer_[1]; + return false; + } + + start = m_utilBuffer_[0]; + textoffset = m_utilBuffer_[1]; + // this totally matches, however we need to check if it is repeating + if (!isBreakUnit(start, textoffset) + || checkRepeatedMatch(start, textoffset) + || hasAccentsBeforeMatch(start, textoffset) + || !checkIdentical(start, textoffset) + || hasAccentsAfterMatch(start, textoffset)) { + textoffset ++; + textoffset = getNextBaseOffset(textoffset); + m_utilBuffer_[0] = textoffset; + return false; + } + + // totally match, we will get rid of the ending ignorables. 
+ m_matchedIndex_ = start; + matchLength = textoffset - start; + return true; + } + + /** + * Getting the previous base character offset, or the current offset if the + * current character is a base character + * @param text the source text to work on + * @param textoffset one offset after the current character + * @return the offset of the next character after the base character or the + * first composed character with accents + */ + private final int getPreviousBaseOffset(CharacterIterator text, + int textoffset) + { + if (textoffset > m_textBeginOffset_) { + while (true) { + int result = textoffset; + text.setIndex(result); + if (UTF16.isTrailSurrogate(text.previous())) { + if (text.getIndex() != text.getBeginIndex() && + !UTF16.isLeadSurrogate(text.previous())) { + text.next(); + } + } + textoffset = text.getIndex(); + char fcd = getFCD(text, textoffset); + if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { + if ((fcd & LAST_BYTE_MASK_) != 0) { + return textoffset; + } + return result; + } + if (textoffset == m_textBeginOffset_) { + return m_textBeginOffset_; + } + } + } + return textoffset; + } + + /** + * Getting the indexes of the accents that are not blocked in the argument + * accent array + * @param accents accents in nfd. 
+ * @param accentsindex array to store the indexes of accents in accents that + * are not blocked + * @return the length of populated accentsindex + */ + private int getUnblockedAccentIndex(StringBuffer accents, + int accentsindex[]) + { + int index = 0; + int length = accents.length(); + int cclass = 0; + int result = 0; + while (index < length) { + int codepoint = UTF16.charAt(accents, index); + int tempclass = UCharacter.getCombiningClass(codepoint); + if (tempclass != cclass) { + cclass = tempclass; + accentsindex[result] = index; + result ++; + } + if (UCharacter.isSupplementary(codepoint)) { + index += 2; + } + else { + index ++; + } + } + accentsindex[result] = length; + return result; + } + + /** + * Appends 3 StringBuffer/CharacterIterator together into a destination + * string buffer. + * @param source1 string buffer + * @param source2 character iterator + * @param start2 start of the character iterator to merge + * @param end2 end of the character iterator to merge + * @param source3 string buffer + * @return appended string buffer + */ + private static final StringBuffer merge(StringBuffer source1, + CharacterIterator source2, + int start2, int end2, + StringBuffer source3) + { + StringBuffer result = new StringBuffer(); + if (source1 != null && source1.length() != 0) { + result.append(source1); + } + source2.setIndex(start2); + while (source2.getIndex() < end2) { + result.append(source2.current()); + source2.next(); + } + if (source3 != null && source3.length() != 0) { + result.append(source3); + } + return result; + } + + /** + * Running through a collation element iterator to see if the contents + * matches pattern in string search data + * @param coleiter collation element iterator to test + * @return true if a match if found, false otherwise + */ + private final boolean checkCollationMatch(CollationElementIterator coleiter) + { + int patternceindex = m_pattern_.m_CELength_; + int offset = 0; + while (patternceindex > 0) { + int ce = 
getCE(coleiter.next()); + if (ce == CollationElementIterator.IGNORABLE) { + continue; + } + if (ce != m_pattern_.m_CE_[offset]) { + return false; + } + offset ++; + patternceindex --; + } + return true; + } + + /** + * Rearranges the front accents to try matching. + * Prefix accents in the text will be grouped according to their combining + * class and the groups will be mixed and matched to try find the perfect + * match with the pattern. + * So for instance looking for "\u0301" in "\u030A\u0301\u0325" + * step 1: split "\u030A\u0301" into 6 other type of potential accent + * substrings "\u030A", "\u0301", "\u0325", "\u030A\u0301", + * "\u030A\u0325", "\u0301\u0325". + * step 2: check if any of the generated substrings matches the pattern. + * Internal method, status is assumed to be success, caller has to check + * status before calling this method. + * @param start first offset of the accents to start searching + * @param end start of the last accent set + * @return DONE if a match is not found, otherwise return the starting + * offset of the match. Note this start includes all preceding + * accents. + */ + private int doNextCanonicalPrefixMatch(int start, int end) + { + if ((getFCD(targetText, start) & LAST_BYTE_MASK_) == 0) { + // die... 
failed at a base character + return DONE; + } + + start = targetText.getIndex(); // index changed by fcd + int offset = getNextBaseOffset(targetText, start); + start = getPreviousBaseOffset(start); + + StringBuffer accents = new StringBuffer(); + String accentstr = getString(targetText, start, offset - start); + // normalizing the offensive string + if (Normalizer.quickCheck(accentstr, Normalizer.NFD) + == Normalizer.NO) { + accentstr = Normalizer.decompose(accentstr, false); + } + accents.append(accentstr); + + int accentsindex[] = new int[INITIAL_ARRAY_SIZE_]; + int accentsize = getUnblockedAccentIndex(accents, accentsindex); + int count = (2 << (accentsize - 1)) - 2; + while (count > 0) { + // copy the base characters + m_canonicalPrefixAccents_.delete(0, + m_canonicalPrefixAccents_.length()); + int k = 0; + for (; k < accentsindex[0]; k ++) { + m_canonicalPrefixAccents_.append(accents.charAt(k)); + } + // forming all possible canonical rearrangement by dropping + // sets of accents + for (int i = 0; i <= accentsize - 1; i ++) { + int mask = 1 << (accentsize - i - 1); + if ((count & mask) != 0) { + for (int j = accentsindex[i]; j < accentsindex[i + 1]; + j ++) { + m_canonicalPrefixAccents_.append(accents.charAt(j)); + } + } + } + StringBuffer match = merge(m_canonicalPrefixAccents_, + targetText, offset, end, + m_canonicalSuffixAccents_); + + // if status is a failure, ucol_setText does nothing. + // run the collator iterator through this match + m_utilColEIter_.setText(match.toString()); + if (checkCollationMatch(m_utilColEIter_)) { + return start; + } + count --; + } + return DONE; + } + + /** + * Gets the offset to the safe point in text before textoffset. + * ie. not the middle of a contraction, swappable characters or + * supplementary characters. 
+ * @param start offset in string + * @param textoffset offset in string + * @return offset to the previous safe character + */ + private final int getPreviousSafeOffset(int start, int textoffset) + { + int result = textoffset; // first contraction character + targetText.setIndex(textoffset); + while (result >= start && m_collator_.isUnsafe(targetText.previous())) { + result = targetText.getIndex(); + } + if (result != start) { + // the first contraction character is consider unsafe here + result = targetText.getIndex(); // originally result --; + } + return result; + } + + /** + * Take the rearranged end accents and tries matching. If match failed at + * a seperate preceding set of accents (seperated from the rearranged on by + * at least a base character) then we rearrange the preceding accents and + * tries matching again. + * We allow skipping of the ends of the accent set if the ces do not match. + * However if the failure is found before the accent set, it fails. + * Internal method, status assumed to be success, caller has to check + * status before calling this method. + * @param textoffset of the start of the rearranged accent + * @return DONE if a match is not found, otherwise return the starting + * offset of the match. Note this start includes all preceding + * accents. 
+ */ + private int doNextCanonicalSuffixMatch(int textoffset) + { + int safelength = 0; + StringBuffer safetext; + int safeoffset = m_textBeginOffset_; + + if (textoffset != m_textBeginOffset_ + && m_canonicalSuffixAccents_.length() > 0 + && m_collator_.isUnsafe(m_canonicalSuffixAccents_.charAt(0))) { + safeoffset = getPreviousSafeOffset(m_textBeginOffset_, + textoffset); + safelength = textoffset - safeoffset; + safetext = merge(null, targetText, safeoffset, textoffset, + m_canonicalSuffixAccents_); + } + else { + safetext = m_canonicalSuffixAccents_; + } + + // if status is a failure, ucol_setText does nothing + CollationElementIterator coleiter = m_utilColEIter_; + coleiter.setText(safetext.toString()); + // status checked in loop below + + int ceindex = m_pattern_.m_CELength_ - 1; + boolean isSafe = true; // indication flag for position in safe zone + + while (ceindex >= 0) { + int textce = coleiter.previous(); + if (textce == CollationElementIterator.NULLORDER) { + // check if we have passed the safe buffer + if (coleiter == m_colEIter_) { + return DONE; + } + coleiter = m_colEIter_; + if (safetext != m_canonicalSuffixAccents_) { + safetext.delete(0, safetext.length()); + } + coleiter.setExactOffset(safeoffset); + // status checked at the start of the loop + isSafe = false; + continue; + } + textce = getCE(textce); + if (textce != CollationElementIterator.IGNORABLE + && textce != m_pattern_.m_CE_[ceindex]) { + // do the beginning stuff + int failedoffset = coleiter.getOffset(); + if (isSafe && failedoffset >= safelength) { + // alas... no hope. 
failed at rearranged accent set + return DONE; + } + else { + if (isSafe) { + failedoffset += safeoffset; + } + + // try rearranging the front accents + int result = doNextCanonicalPrefixMatch(failedoffset, + textoffset); + if (result != DONE) { + // if status is a failure, ucol_setOffset does nothing + m_colEIter_.setExactOffset(result); + } + return result; + } + } + if (textce == m_pattern_.m_CE_[ceindex]) { + ceindex --; + } + } + // set offset here + if (isSafe) { + int result = coleiter.getOffset(); + // sets the text iterator with the correct expansion and offset + int leftoverces = coleiter.m_CEBufferOffset_; + if (result >= safelength) { + result = textoffset; + } + else { + result += safeoffset; + } + m_colEIter_.setExactOffset(result); + m_colEIter_.m_CEBufferOffset_ = leftoverces; + return result; + } + + return coleiter.getOffset(); + } + + /** + * Trying out the substring and sees if it can be a canonical match. + * This will try normalizing the end accents and arranging them into + * canonical equivalents and check their corresponding ces with the pattern + * ce. + * Suffix accents in the text will be grouped according to their combining + * class and the groups will be mixed and matched to try find the perfect + * match with the pattern. + * So for instance looking for "\u0301" in "\u030A\u0301\u0325" + * step 1: split "\u030A\u0301" into 6 other type of potential accent + * substrings + * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", + * "\u0301\u0325". + * step 2: check if any of the generated substrings matches the pattern. 
+ * @param textoffset end offset in the collation element text that ends with + * the accents to be rearranged + * @return true if the match is valid, false otherwise + */ + private boolean doNextCanonicalMatch(int textoffset) + { + int offset = m_colEIter_.getOffset(); + targetText.setIndex(textoffset); + if (UTF16.isTrailSurrogate(targetText.previous()) + && targetText.getIndex() > m_textBeginOffset_) { + if (!UTF16.isLeadSurrogate(targetText.previous())) { + targetText.next(); + } + } + if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) { + if (m_pattern_.m_hasPrefixAccents_) { + offset = doNextCanonicalPrefixMatch(offset, textoffset); + if (offset != DONE) { + m_colEIter_.setExactOffset(offset); + return true; + } + } + return false; + } + + if (!m_pattern_.m_hasSuffixAccents_) { + return false; + } + + StringBuffer accents = new StringBuffer(); + // offset to the last base character in substring to search + int baseoffset = getPreviousBaseOffset(targetText, textoffset); + // normalizing the offensive string + String accentstr = getString(targetText, baseoffset, + textoffset - baseoffset); + if (Normalizer.quickCheck(accentstr, Normalizer.NFD) + == Normalizer.NO) { + accentstr = Normalizer.decompose(accentstr, false); + } + accents.append(accentstr); + // status checked in loop below + + int accentsindex[] = new int[INITIAL_ARRAY_SIZE_]; + int size = getUnblockedAccentIndex(accents, accentsindex); + + // 2 power n - 1 minus the full set of accents + int count = (2 << (size - 1)) - 2; + while (count > 0) { + m_canonicalSuffixAccents_.delete(0, + m_canonicalSuffixAccents_.length()); + // copy the base characters + for (int k = 0; k < accentsindex[0]; k ++) { + m_canonicalSuffixAccents_.append(accents.charAt(k)); + } + // forming all possible canonical rearrangement by dropping + // sets of accents + for (int i = 0; i <= size - 1; i ++) { + int mask = 1 << (size - i - 1); + if ((count & mask) != 0) { + for (int j = accentsindex[i]; j < 
accentsindex[i + 1]; + j ++) { + m_canonicalSuffixAccents_.append(accents.charAt(j)); + } + } + } + offset = doNextCanonicalSuffixMatch(baseoffset); + if (offset != DONE) { + return true; // match found + } + count --; + } + return false; + } + + /** + * Gets the previous base character offset depending on the string search + * pattern data + * @param strsrch string search data + * @param textoffset current offset, current character + * @return the offset of the next character after this base character or + * itself if it is a composed character with accents + */ + private final int getPreviousBaseOffset(int textoffset) + { + if (m_pattern_.m_hasPrefixAccents_ && textoffset > m_textBeginOffset_) { + int offset = textoffset; + if ((getFCD(targetText, offset) >> SECOND_LAST_BYTE_SHIFT_) != 0) { + return getPreviousBaseOffset(targetText, textoffset); + } + } + return textoffset; + } + + /** + * Checks match for contraction. + * If the match ends with a partial contraction we fail. + * If the match starts too far off (because of backwards iteration) we try + * to chip off the extra characters. + * Uses the temporary util buffer for return values of the modified start + * and end. + * @param start offset of potential match, to be modified if necessary + * @param end offset of potential match, to be modified if necessary + * @return true if match passes the contraction test, false otherwise. + */ + private boolean checkNextCanonicalContractionMatch(int start, int end) + { + // This part checks if either ends of the match contains potential + // contraction. 
If so we'll have to iterate through them + char schar = 0; + char echar = 0; + if (end < m_textLimitOffset_) { + targetText.setIndex(end); + echar = targetText.current(); + } + if (start < m_textLimitOffset_) { + targetText.setIndex(start + 1); + schar = targetText.current(); + } + if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) { + int expansion = m_colEIter_.m_CEBufferOffset_; + boolean hasExpansion = expansion > 0; + m_colEIter_.setExactOffset(start); + int temp = start; + while (expansion > 0) { + // getting rid of the redundant ce, caused by setOffset. + // since backward contraction/expansion may have extra ces if + // we are in the normalization buffer, hasAccentsBeforeMatch + // would have taken care of it. + // E.g. the character \u01FA will have an expansion of 3, but + // if we are only looking for acute and ring \u030A and \u0301, + // we'll have to skip the first ce in the expansion buffer. + m_colEIter_.next(); + if (m_colEIter_.getOffset() != temp) { + start = temp; + temp = m_colEIter_.getOffset(); + } + expansion --; + } + + int count = 0; + while (count < m_pattern_.m_CELength_) { + int ce = getCE(m_colEIter_.next()); + // status checked below, note that if status is a failure + // ucol_next returns UCOL_NULLORDER + if (ce == CollationElementIterator.IGNORABLE) { + continue; + } + if (hasExpansion && count == 0 + && m_colEIter_.getOffset() != temp) { + start = temp; + temp = m_colEIter_.getOffset(); + } + + if (count == 0 && ce != m_pattern_.m_CE_[0]) { + // accents may have extra starting ces, this occurs when a + // pure accent pattern is matched without rearrangement + // text \u0325\u0300 and looking for \u0300 + int expected = m_pattern_.m_CE_[0]; + if ((getFCD(targetText, start) & LAST_BYTE_MASK_) != 0) { + ce = getCE(m_colEIter_.next()); + while (ce != expected + && ce != CollationElementIterator.NULLORDER + && m_colEIter_.getOffset() <= end) { + ce = getCE(m_colEIter_.next()); + } + } + } + if (ce != 
m_pattern_.m_CE_[count]) { + end ++; + end = getNextBaseOffset(end); + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return false; + } + count ++; + } + } + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return true; + } + + /** + * Checks and sets the match information if found. + * Checks + *- the potential match does not repeat the previous match + *
- boundaries are correct + *
- exact matches have no extra accents +
- identical matches +
- potential match does not end in the middle of a contraction + *
+ *
+ * Otherwise the offset will be shifted to the next character. + * The result m_matchIndex_ and m_matchLength_ will be set to the truncated + * more fitting result value. + * Uses the temporary utility buffer for storing the modified textoffset. + * @param textoffset offset in the collation element text. + * @return true if the match is valid, false otherwise + */ + private boolean checkNextCanonicalMatch(int textoffset) + { + // to ensure that the start and ends are not composite characters + // if we have a canonical accent match + if ((m_pattern_.m_hasSuffixAccents_ + && m_canonicalSuffixAccents_.length() != 0) || + (m_pattern_.m_hasPrefixAccents_ + && m_canonicalPrefixAccents_.length() != 0)) { + m_matchedIndex_ = getPreviousBaseOffset(m_colEIter_.getOffset()); + matchLength = textoffset - m_matchedIndex_; + return true; + } + + int start = m_colEIter_.getOffset(); + if (!checkNextCanonicalContractionMatch(start, textoffset)) { + // return the modified textoffset + m_utilBuffer_[0] = m_utilBuffer_[1]; + return false; + } + start = m_utilBuffer_[0]; + textoffset = m_utilBuffer_[1]; + start = getPreviousBaseOffset(start); + // this totally matches, however we need to check if it is repeating + if (checkRepeatedMatch(start, textoffset) + || !isBreakUnit(start, textoffset) + || !checkIdentical(start, textoffset)) { + textoffset ++; + textoffset = getNextBaseOffset(targetText, textoffset); + m_utilBuffer_[0] = textoffset; + return false; + } + + m_matchedIndex_ = start; + matchLength = textoffset - start; + return true; + } + + /** + * Shifting the collation element iterator position forward to prepare for + * a preceding match. If the first character is a unsafe character, we'll + * only shift by 1 to capture contractions, normalization etc. + * @param textoffset start text position to do search + * @param ce the text ce which failed the match. 
+ * @param patternceindex index of the ce within the pattern ce buffer which + * failed the match + * @return final offset + */ + private int reverseShift(int textoffset, int ce, int patternceindex) + { + if (isOverlapping()) { + if (textoffset != m_textLimitOffset_) { + textoffset --; + } + else { + textoffset -= m_pattern_.m_defaultShiftSize_; + } + } + else { + if (ce != CollationElementIterator.NULLORDER) { + int shift = m_pattern_.m_backShift_[hash(ce)]; + + // this is to adjust for characters in the middle of the substring + // for matching that failed. + int adjust = patternceindex; + if (adjust > 1 && shift > adjust) { + shift -= adjust - 1; + } + textoffset -= shift; + } + else { + textoffset -= m_pattern_.m_defaultShiftSize_; + } + } + + textoffset = getPreviousBaseOffset(textoffset); + return textoffset; + } + + /** + * Checks match for contraction. + * If the match starts with a partial contraction we fail. + * Uses the temporary utility buffer to return the modified start and end. + * @param start offset of potential match, to be modified if necessary + * @param end offset of potential match, to be modified if necessary + * @return true if match passes the contraction test, false otherwise. + */ + private boolean checkPreviousExactContractionMatch(int start, int end) + { + // This part checks if either ends of the match contains potential + // contraction. 
If so we'll have to iterate through them + char echar = 0; + if (end < m_textLimitOffset_) { + targetText.setIndex(end); + echar = targetText.current(); + } + char schar = 0; + if (start + 1 < m_textLimitOffset_) { + targetText.setIndex(start + 1); + schar = targetText.current(); + } + if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) { + // expansion suffix, what's left to iterate + int expansion = m_colEIter_.m_CEBufferSize_ + - m_colEIter_.m_CEBufferOffset_; + boolean hasExpansion = expansion > 0; + m_colEIter_.setExactOffset(end); + int temp = end; + while (expansion > 0) { + // getting rid of the redundant ce + // since forward contraction/expansion may have extra ces + // if we are in the normalization buffer, hasAccentsBeforeMatch + // would have taken care of it. + // E.g. the character \u01FA will have an expansion of 3, but if + // we are only looking for A ring A\u030A, we'll have to skip the + // last ce in the expansion buffer + m_colEIter_.previous(); + if (m_colEIter_.getOffset() != temp) { + end = temp; + temp = m_colEIter_.getOffset(); + } + expansion --; + } + + int count = m_pattern_.m_CELength_; + while (count > 0) { + int ce = getCE(m_colEIter_.previous()); + // status checked below, note that if status is a failure + // ucol_previous returns UCOL_NULLORDER + if (ce == CollationElementIterator.IGNORABLE) { + continue; + } + if (hasExpansion && count == 0 + && m_colEIter_.getOffset() != temp) { + end = temp; + temp = m_colEIter_.getOffset(); + } + if (ce != m_pattern_.m_CE_[count - 1]) { + start --; + start = getPreviousBaseOffset(targetText, start); + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return false; + } + count --; + } + } + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return true; + } + + /** + * Checks and sets the match information if found. + * Checks + *- the potential match does not repeat the previous match + *
- boundaries are correct + *
- potential match does not end in the middle of a contraction + *
- identical matches + *
+ *
+ * Otherwise the offset will be shifted to the preceding character. + * Uses the temporary utility buffer to store the modified textoffset. + * @param textoffset offset in the collation element text. the returned value + * will be the truncated start offset of the match or the new start + * search offset. + * @return true if the match is valid, false otherwise + */ + private final boolean checkPreviousExactMatch(int textoffset) + { + // to ensure that the start and ends are not composite characters + int end = m_colEIter_.getOffset(); + if (!checkPreviousExactContractionMatch(textoffset, end)) { + return false; + } + textoffset = m_utilBuffer_[0]; + end = m_utilBuffer_[1]; + + // this totally matches, however we need to check if it is repeating + // the old match + if (checkRepeatedMatch(textoffset, end) + || !isBreakUnit(textoffset, end) + || hasAccentsBeforeMatch(textoffset, end) + || !checkIdentical(textoffset, end) + || hasAccentsAfterMatch(textoffset, end)) { + textoffset --; + textoffset = getPreviousBaseOffset(targetText, textoffset); + m_utilBuffer_[0] = textoffset; + return false; + } + m_matchedIndex_ = textoffset; + matchLength = end - textoffset; + return true; + } + + /** + * Rearranges the end accents to try matching. + * Suffix accents in the text will be grouped according to their combining + * class and the groups will be mixed and matched to try find the perfect + * match with the pattern. + * So for instance looking for "\u0301" in "\u030A\u0301\u0325" + * step 1: split "\u030A\u0301" into 6 other type of potential accent + * substrings + * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", + * "\u0301\u0325". + * step 2: check if any of the generated substrings matches the pattern. + * @param start offset of the first base character + * @param end start of the last accent set + * @return DONE if a match is not found, otherwise return the ending + * offset of the match. Note this start includes all following + * accents. 
+ */ + private int doPreviousCanonicalSuffixMatch(int start, int end) + { + targetText.setIndex(end); + if (UTF16.isTrailSurrogate(targetText.previous()) + && targetText.getIndex() > m_textBeginOffset_) { + if (!UTF16.isLeadSurrogate(targetText.previous())) { + targetText.next(); + } + } + if ((getFCD(targetText, targetText.getIndex()) & LAST_BYTE_MASK_) == 0) { + // die... failed at a base character + return DONE; + } + end = getNextBaseOffset(targetText, end); + + StringBuffer accents = new StringBuffer(); + int offset = getPreviousBaseOffset(targetText, end); + // normalizing the offensive string + String accentstr = getString(targetText, offset, end - offset); + if (Normalizer.quickCheck(accentstr, Normalizer.NFD) + == Normalizer.NO) { + accentstr = Normalizer.decompose(accentstr, false); + } + accents.append(accentstr); + + int accentsindex[] = new int[INITIAL_ARRAY_SIZE_]; + int accentsize = getUnblockedAccentIndex(accents, accentsindex); + int count = (2 << (accentsize - 1)) - 2; + while (count > 0) { + m_canonicalSuffixAccents_.delete(0, + m_canonicalSuffixAccents_.length()); + // copy the base characters + for (int k = 0; k < accentsindex[0]; k ++) { + m_canonicalSuffixAccents_.append(accents.charAt(k)); + } + // forming all possible canonical rearrangement by dropping + // sets of accents + for (int i = 0; i <= accentsize - 1; i ++) { + int mask = 1 << (accentsize - i - 1); + if ((count & mask) != 0) { + for (int j = accentsindex[i]; j < accentsindex[i + 1]; + j ++) { + m_canonicalSuffixAccents_.append(accents.charAt(j)); + } + } + } + StringBuffer match = merge(m_canonicalPrefixAccents_, targetText, + start, offset, + m_canonicalSuffixAccents_); + // run the collator iterator through this match + // if status is a failure ucol_setText does nothing + m_utilColEIter_.setText(match.toString()); + if (checkCollationMatch(m_utilColEIter_)) { + return end; + } + count --; + } + return DONE; + } + + /** + * Take the rearranged start accents and tries matching. 
If match failed at + * a seperate following set of accents (seperated from the rearranged on by + * at least a base character) then we rearrange the preceding accents and + * tries matching again. + * We allow skipping of the ends of the accent set if the ces do not match. + * However if the failure is found before the accent set, it fails. + * Internal method, status assumed to be success, caller has to check + * status before calling this method. + * @param textoffset of the ends of the rearranged accent + * @return DONE if a match is not found, otherwise return the ending offset + * of the match. Note this start includes all following accents. + */ + private int doPreviousCanonicalPrefixMatch(int textoffset) + { + int safelength = 0; + StringBuffer safetext; + int safeoffset = textoffset; + + if (textoffset > m_textBeginOffset_ + && m_collator_.isUnsafe(m_canonicalPrefixAccents_.charAt( + m_canonicalPrefixAccents_.length() - 1))) { + safeoffset = getNextSafeOffset(textoffset, m_textLimitOffset_); + safelength = safeoffset - textoffset; + safetext = merge(m_canonicalPrefixAccents_, targetText, textoffset, + safeoffset, null); + } + else { + safetext = m_canonicalPrefixAccents_; + } + + // if status is a failure, ucol_setText does nothing + CollationElementIterator coleiter = m_utilColEIter_; + coleiter.setText(safetext.toString()); + // status checked in loop below + + int ceindex = 0; + boolean isSafe = true; // safe zone indication flag for position + int prefixlength = m_canonicalPrefixAccents_.length(); + + while (ceindex < m_pattern_.m_CELength_) { + int textce = coleiter.next(); + if (textce == CollationElementIterator.NULLORDER) { + // check if we have passed the safe buffer + if (coleiter == m_colEIter_) { + return DONE; + } + if (safetext != m_canonicalPrefixAccents_) { + safetext.delete(0, safetext.length()); + } + coleiter = m_colEIter_; + coleiter.setExactOffset(safeoffset); + // status checked at the start of the loop + isSafe = false; + continue; + 
} + textce = getCE(textce); + if (textce != CollationElementIterator.IGNORABLE + && textce != m_pattern_.m_CE_[ceindex]) { + // do the beginning stuff + int failedoffset = coleiter.getOffset(); + if (isSafe && failedoffset <= prefixlength) { + // alas... no hope. failed at rearranged accent set + return DONE; + } + else { + if (isSafe) { + failedoffset = safeoffset - failedoffset; + if (safetext != m_canonicalPrefixAccents_) { + safetext.delete(0, safetext.length()); + } + } + + // try rearranging the end accents + int result = doPreviousCanonicalSuffixMatch(textoffset, + failedoffset); + if (result != DONE) { + // if status is a failure, ucol_setOffset does nothing + m_colEIter_.setExactOffset(result); + } + return result; + } + } + if (textce == m_pattern_.m_CE_[ceindex]) { + ceindex ++; + } + } + // set offset here + if (isSafe) { + int result = coleiter.getOffset(); + // sets the text iterator here with the correct expansion and offset + int leftoverces = coleiter.m_CEBufferSize_ + - coleiter.m_CEBufferOffset_; + if (result <= prefixlength) { + result = textoffset; + } + else { + result = textoffset + (safeoffset - result); + } + m_colEIter_.setExactOffset(result); + m_colEIter_.m_CEBufferOffset_ = m_colEIter_.m_CEBufferSize_ + - leftoverces; + return result; + } + + return coleiter.getOffset(); + } + + /** + * Trying out the substring and sees if it can be a canonical match. + * This will try normalizing the starting accents and arranging them into + * canonical equivalents and check their corresponding ces with the pattern + * ce. + * Prefix accents in the text will be grouped according to their combining + * class and the groups will be mixed and matched to try find the perfect + * match with the pattern. + * So for instance looking for "\u0301" in "\u030A\u0301\u0325" + * step 1: split "\u030A\u0301" into 6 other type of potential accent + * substrings + * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", + * "\u0301\u0325". 
+ * step 2: check if any of the generated substrings matches the pattern. + * @param textoffset start offset in the collation element text that starts + * with the accents to be rearranged + * @return true if the match is valid, false otherwise + */ + private boolean doPreviousCanonicalMatch(int textoffset) + { + int offset = m_colEIter_.getOffset(); + if ((getFCD(targetText, textoffset) >> SECOND_LAST_BYTE_SHIFT_) == 0) { + if (m_pattern_.m_hasSuffixAccents_) { + offset = doPreviousCanonicalSuffixMatch(textoffset, offset); + if (offset != DONE) { + m_colEIter_.setExactOffset(offset); + return true; + } + } + return false; + } + + if (!m_pattern_.m_hasPrefixAccents_) { + return false; + } + + StringBuffer accents = new StringBuffer(); + // offset to the last base character in substring to search + int baseoffset = getNextBaseOffset(targetText, textoffset); + // normalizing the offensive string + String textstr = getString(targetText, textoffset, + baseoffset - textoffset); + if (Normalizer.quickCheck(textstr, Normalizer.NFD) + == Normalizer.NO) { + textstr = Normalizer.decompose(textstr, false); + } + accents.append(textstr); + // status checked in loop + + int accentsindex[] = new int[INITIAL_ARRAY_SIZE_]; + int size = getUnblockedAccentIndex(accents, accentsindex); + + // 2 power n - 1 minus the full set of accents + int count = (2 << (size - 1)) - 2; + while (count > 0) { + m_canonicalPrefixAccents_.delete(0, + m_canonicalPrefixAccents_.length()); + // copy the base characters + for (int k = 0; k < accentsindex[0]; k ++) { + m_canonicalPrefixAccents_.append(accents.charAt(k)); + } + // forming all possible canonical rearrangement by dropping + // sets of accents + for (int i = 0; i <= size - 1; i ++) { + int mask = 1 << (size - i - 1); + if ((count & mask) != 0) { + for (int j = accentsindex[i]; j < accentsindex[i + 1]; + j ++) { + m_canonicalPrefixAccents_.append(accents.charAt(j)); + } + } + } + offset = doPreviousCanonicalPrefixMatch(baseoffset); + if (offset 
!= DONE) { + return true; // match found + } + count --; + } + return false; + } + + /** + * Checks match for contraction. + * If the match starts with a partial contraction we fail. + * Uses the temporary utility buffer to return the modified start and end. + * @param start offset of potential match, to be modified if necessary + * @param end offset of potential match, to be modified if necessary + * @return true if match passes the contraction test, false otherwise. + */ + private boolean checkPreviousCanonicalContractionMatch(int start, int end) + { + int temp = end; + // This part checks if either ends of the match contains potential + // contraction. If so we'll have to iterate through them + char echar = 0; + char schar = 0; + if (end < m_textLimitOffset_) { + targetText.setIndex(end); + echar = targetText.current(); + } + if (start + 1 < m_textLimitOffset_) { + targetText.setIndex(start + 1); + schar = targetText.current(); + } + if (m_collator_.isUnsafe(echar) || m_collator_.isUnsafe(schar)) { + int expansion = m_colEIter_.m_CEBufferSize_ + - m_colEIter_.m_CEBufferOffset_; + boolean hasExpansion = expansion > 0; + m_colEIter_.setExactOffset(end); + while (expansion > 0) { + // getting rid of the redundant ce + // since forward contraction/expansion may have extra ces + // if we are in the normalization buffer, hasAccentsBeforeMatch + // would have taken care of it. + // E.g. 
the character \u01FA will have an expansion of 3, but + // if we are only looking for A ring A\u030A, we'll have to + // skip the last ce in the expansion buffer + m_colEIter_.previous(); + if (m_colEIter_.getOffset() != temp) { + end = temp; + temp = m_colEIter_.getOffset(); + } + expansion --; + } + + int count = m_pattern_.m_CELength_; + while (count > 0) { + int ce = getCE(m_colEIter_.previous()); + // status checked below, note that if status is a failure + // previous() returns NULLORDER + if (ce == CollationElementIterator.IGNORABLE) { + continue; + } + if (hasExpansion && count == 0 + && m_colEIter_.getOffset() != temp) { + end = temp; + temp = m_colEIter_.getOffset(); + } + if (count == m_pattern_.m_CELength_ + && ce != m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]) { + // accents may have extra starting ces, this occurs when a + // pure accent pattern is matched without rearrangement + int expected = m_pattern_.m_CE_[m_pattern_.m_CELength_ - 1]; + targetText.setIndex(end); + if (UTF16.isTrailSurrogate(targetText.previous())) { + if (targetText.getIndex() > m_textBeginOffset_ && + !UTF16.isLeadSurrogate(targetText.previous())) { + targetText.next(); + } + } + end = targetText.getIndex(); + if ((getFCD(targetText, end) & LAST_BYTE_MASK_) != 0) { + ce = getCE(m_colEIter_.previous()); + while (ce != expected + && ce != CollationElementIterator.NULLORDER + && m_colEIter_.getOffset() <= start) { + ce = getCE(m_colEIter_.previous()); + } + } + } + if (ce != m_pattern_.m_CE_[count - 1]) { + start --; + start = getPreviousBaseOffset(start); + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return false; + } + count --; + } + } + m_utilBuffer_[0] = start; + m_utilBuffer_[1] = end; + return true; + } + + /** + * Checks and sets the match information if found. + * Checks + *- the current match does not repeat the last match + *
- boundaries are correct + *
- exact matches have no extra accents + * <li>
- identical matches + *
+ *
+ * Otherwise the offset will be shifted to the next character. + * Uses the temporary utility buffer for storing the modified textoffset. + * @param textoffset offset in the collation element text. the returned + * value will be the truncated start offset of the match or the + * new start search offset. + * @return true if the match is valid, false otherwise + */ + private boolean checkPreviousCanonicalMatch(int textoffset) + { + // to ensure that the start and ends are not composite characters + // if we have a canonical accent match + if (m_pattern_.m_hasSuffixAccents_ + && m_canonicalSuffixAccents_.length() != 0 + || m_pattern_.m_hasPrefixAccents_ + && m_canonicalPrefixAccents_.length() != 0) { + m_matchedIndex_ = textoffset; + matchLength = getNextBaseOffset(m_colEIter_.getOffset()) + - textoffset; + return true; + } + + int end = m_colEIter_.getOffset(); + if (!checkPreviousCanonicalContractionMatch(textoffset, end)) { + // storing the modified textoffset + return false; + } + textoffset = m_utilBuffer_[0]; + end = m_utilBuffer_[1]; + end = getNextBaseOffset(end); + // this totally matches, however we need to check if it is repeating + if (checkRepeatedMatch(textoffset, end) + || !isBreakUnit(textoffset, end) + || !checkIdentical(textoffset, end)) { + textoffset --; + textoffset = getPreviousBaseOffset(textoffset); + m_utilBuffer_[0] = textoffset; + return false; + } + + m_matchedIndex_ = textoffset; + matchLength = end - textoffset; + return true; + } + + /** + * Method that does the next exact match + * @param start the offset to start shifting from and performing the + * next exact match + */ + private void handleNextExact(int start) + { + int textoffset = shiftForward(start, + CollationElementIterator.NULLORDER, + m_pattern_.m_CELength_); + int targetce = CollationElementIterator.IGNORABLE; + while (textoffset <= m_textLimitOffset_) { + m_colEIter_.setExactOffset(textoffset); + int patternceindex = m_pattern_.m_CELength_ - 1; + boolean found = false; + 
int lastce = CollationElementIterator.NULLORDER; + + while (true) { + // finding the last pattern ce match, imagine composite + // characters. for example: search for pattern A in text \u00C0 + // we'll have to skip \u0300 the grave first before we get to A + targetce = m_colEIter_.previous(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (targetce == CollationElementIterator.IGNORABLE && + m_colEIter_.isInBuffer()) { + // this is for the text \u0315\u0300 that requires + // normalization and pattern \u0300, where \u0315 is ignorable + continue; + } + if (lastce == CollationElementIterator.NULLORDER + || lastce == CollationElementIterator.IGNORABLE) { + lastce = targetce; + } + if (targetce == m_pattern_.m_CE_[patternceindex]) { + // the first ce can be a contraction + found = true; + break; + } + if (m_colEIter_.m_CEBufferOffset_ <= 0) { + found = false; + break; + } + } + + targetce = lastce; + + while (found && patternceindex > 0) { + targetce = m_colEIter_.previous(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (targetce == CollationElementIterator.IGNORABLE) { + continue; + } + + patternceindex --; + found = found && targetce == m_pattern_.m_CE_[patternceindex]; + } + + if (!found) { + textoffset = shiftForward(textoffset, targetce, + patternceindex); + // status checked at loop. 
+ patternceindex = m_pattern_.m_CELength_; + continue; + } + + if (checkNextExactMatch(textoffset)) { + // status checked in ucol_setOffset + return; + } + textoffset = m_utilBuffer_[0]; + } + setMatchNotFound(); + } + + /** + * Method that does the next canonical match + * @param start the offset to start shifting from and performing the + * next canonical match + */ + private void handleNextCanonical(int start) + { + boolean hasPatternAccents = + m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_; + + // shifting it check for setting offset + // if setOffset is called previously or there was no previous match, we + // leave the offset as it is. + int textoffset = shiftForward(start, CollationElementIterator.NULLORDER, + m_pattern_.m_CELength_); + m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length()); + m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length()); + int targetce = CollationElementIterator.IGNORABLE; + + while (textoffset <= m_textLimitOffset_) + { + m_colEIter_.setExactOffset(textoffset); + int patternceindex = m_pattern_.m_CELength_ - 1; + boolean found = false; + int lastce = CollationElementIterator.NULLORDER; + + while (true) { + // finding the last pattern ce match, imagine composite characters + // for example: search for pattern A in text \u00C0 + // we'll have to skip \u0300 the grave first before we get to A + targetce = m_colEIter_.previous(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (lastce == CollationElementIterator.NULLORDER + || lastce == CollationElementIterator.IGNORABLE) { + lastce = targetce; + } + if (targetce == m_pattern_.m_CE_[patternceindex]) { + // the first ce can be a contraction + found = true; + break; + } + if (m_colEIter_.m_CEBufferOffset_ <= 0) { + found = false; + break; + } + } + targetce = lastce; + + while (found && patternceindex > 0) { + targetce = m_colEIter_.previous(); + if (targetce == 
CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (targetce == CollationElementIterator.IGNORABLE) { + continue; + } + + patternceindex --; + found = found && targetce == m_pattern_.m_CE_[patternceindex]; + } + + // initializing the rearranged accent array + if (hasPatternAccents && !found) { + found = doNextCanonicalMatch(textoffset); + } + + if (!found) { + textoffset = shiftForward(textoffset, targetce, patternceindex); + // status checked at loop + patternceindex = m_pattern_.m_CELength_; + continue; + } + + if (checkNextCanonicalMatch(textoffset)) { + return; + } + textoffset = m_utilBuffer_[0]; + } + setMatchNotFound(); + } + + /** + * Method that does the previous exact match + * @param start the offset to start shifting from and performing the + * previous exact match + */ + private void handlePreviousExact(int start) + { + int textoffset = reverseShift(start, CollationElementIterator.NULLORDER, + m_pattern_.m_CELength_); + while (textoffset >= m_textBeginOffset_) + { + m_colEIter_.setExactOffset(textoffset); + int patternceindex = 1; + int targetce = CollationElementIterator.IGNORABLE; + boolean found = false; + int firstce = CollationElementIterator.NULLORDER; + + while (true) { + // finding the first pattern ce match, imagine composite + // characters. 
for example: search for pattern \u0300 in text + // \u00C0, we'll have to skip A first before we get to + // \u0300 the grave accent + targetce = m_colEIter_.next(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (firstce == CollationElementIterator.NULLORDER + || firstce == CollationElementIterator.IGNORABLE) { + firstce = targetce; + } + if (targetce == CollationElementIterator.IGNORABLE) { + continue; + } + if (targetce == m_pattern_.m_CE_[0]) { + found = true; + break; + } + if (m_colEIter_.m_CEBufferOffset_ == -1 + || m_colEIter_.m_CEBufferOffset_ + == m_colEIter_.m_CEBufferSize_) { + // checking for accents in composite character + found = false; + break; + } + } + + targetce = firstce; + + while (found && patternceindex < m_pattern_.m_CELength_) { + targetce = m_colEIter_.next(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (targetce == CollationElementIterator.IGNORABLE) { + continue; + } + + found = found && targetce == m_pattern_.m_CE_[patternceindex]; + patternceindex ++; + } + + if (!found) { + textoffset = reverseShift(textoffset, targetce, patternceindex); + patternceindex = 0; + continue; + } + + if (checkPreviousExactMatch(textoffset)) { + return; + } + textoffset = m_utilBuffer_[0]; + } + setMatchNotFound(); + } + + /** + * Method that does the previous canonical match + * @param start the offset to start shifting from and performing the + * previous canonical match + */ + private void handlePreviousCanonical(int start) + { + boolean hasPatternAccents = + m_pattern_.m_hasSuffixAccents_ || m_pattern_.m_hasPrefixAccents_; + + // shifting it check for setting offset + // if setOffset is called previously or there was no previous match, we + // leave the offset as it is. 
+ int textoffset = reverseShift(start, CollationElementIterator.NULLORDER, + m_pattern_.m_CELength_); + m_canonicalPrefixAccents_.delete(0, m_canonicalPrefixAccents_.length()); + m_canonicalSuffixAccents_.delete(0, m_canonicalSuffixAccents_.length()); + + while (textoffset >= m_textBeginOffset_) + { + m_colEIter_.setExactOffset(textoffset); + int patternceindex = 1; + int targetce = CollationElementIterator.IGNORABLE; + boolean found = false; + int firstce = CollationElementIterator.NULLORDER; + + while (true) { + // finding the first pattern ce match, imagine composite + // characters. for example: search for pattern \u0300 in text + // \u00C0, we'll have to skip A first before we get to + // \u0300 the grave accent + targetce = m_colEIter_.next(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (firstce == CollationElementIterator.NULLORDER + || firstce == CollationElementIterator.IGNORABLE) { + firstce = targetce; + } + + if (targetce == m_pattern_.m_CE_[0]) { + // the first ce can be a contraction + found = true; + break; + } + if (m_colEIter_.m_CEBufferOffset_ == -1 + || m_colEIter_.m_CEBufferOffset_ + == m_colEIter_.m_CEBufferSize_) { + // checking for accents in composite character + found = false; + break; + } + } + + targetce = firstce; + + while (found && patternceindex < m_pattern_.m_CELength_) { + targetce = m_colEIter_.next(); + if (targetce == CollationElementIterator.NULLORDER) { + found = false; + break; + } + targetce = getCE(targetce); + if (targetce == CollationElementIterator.IGNORABLE) { + continue; + } + + found = found && targetce == m_pattern_.m_CE_[patternceindex]; + patternceindex ++; + } + + // initializing the rearranged accent array + if (hasPatternAccents && !found) { + found = doPreviousCanonicalMatch(textoffset); + } + + if (!found) { + textoffset = reverseShift(textoffset, targetce, patternceindex); + patternceindex = 0; + continue; + } + + if 
(checkPreviousCanonicalMatch(textoffset)) { + return; + } + textoffset = m_utilBuffer_[0]; + } + setMatchNotFound(); + } + + /** + * Gets a substring out of a CharacterIterator + * @param text CharacterIterator + * @param start start offset + * @param length of substring + * @return substring from text starting at start and length length + */ + private static final String getString(CharacterIterator text, int start, + int length) + { + StringBuffer result = new StringBuffer(length); + int offset = text.getIndex(); + text.setIndex(start); + for (int i = 0; i < length; i ++) { + result.append(text.current()); + text.next(); + } + text.setIndex(offset); + return result.toString(); + } + + /** + * Getting the mask for collation strength + * @param strength collation strength + * @return collation element mask + */ + private static final int getMask(int strength) + { + switch (strength) + { + case Collator.PRIMARY: + return RuleBasedCollator.CE_PRIMARY_MASK_; + case Collator.SECONDARY: + return RuleBasedCollator.CE_SECONDARY_MASK_ + | RuleBasedCollator.CE_PRIMARY_MASK_; + default: + return RuleBasedCollator.CE_TERTIARY_MASK_ + | RuleBasedCollator.CE_SECONDARY_MASK_ + | RuleBasedCollator.CE_PRIMARY_MASK_; + } + } + + /** + * Sets match not found + */ + private void setMatchNotFound() + { + // this method resets the match result regardless of the error status. + m_matchedIndex_ = DONE; + setMatchLength(0); } -}; +}- the potential match does not repeat the previous match + *
- boundaries are correct + *
- potential match does not end in the middle of a contraction + *
- identical matches + *