initial collation commits

X-SVN-Rev: 8615
This commit is contained in:
Syn Wee Quek 2002-05-14 16:48:49 +00:00
parent fa460c1481
commit 44672d459f
7 changed files with 6543 additions and 44 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UCharacterIterator.java,v $
* $Date: 2002/04/03 00:00:00 $
* $Revision: 1.4 $
* $Date: 2002/05/14 16:48:49 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -41,27 +41,62 @@ public final class UCharacterIterator implements CharacterIterator
// public constructor ------------------------------------------------------
/**
* Public constructor
* Public constructor.
* By default the iteration range will be from 0 to the end of the text.
* @param replaceable text which the iterator will be based on
*/
public UCharacterIterator(Replaceable replaceable)
{
m_replaceable_ = replaceable;
m_index_ = 0;
m_length_ = replaceable.length();
m_start_ = 0;
m_limit_ = replaceable.length();
}
/**
* Public constructor
* By default the iteration range will be from 0 to the end of the text.
* @param str text which the iterator will be based on
*/
public UCharacterIterator(String str)
{
m_replaceable_ = new ReplaceableString(str);
m_index_ = 0;
m_length_ = m_replaceable_.length();
m_start_ = 0;
m_limit_ = m_replaceable_.length();
}
/**
* Constructs an iterator over the given range of the given string.
* The string is wrapped in a ReplaceableString and the current index is
* positioned at <code>start</code>.
* Note that <code>start</code> and <code>limit</code> are stored as given;
* this constructor performs no validation of the range against the text.
* @param str text to be iterated over
* @param start offset of the first character to iterate
* @param limit offset of the character following the last character to
* iterate
*/
public UCharacterIterator(String str, int start, int limit)
{
m_replaceable_ = new ReplaceableString(str);
m_start_ = start;
m_limit_ = limit;
m_index_ = m_start_;
}
/**
* Constructs an iterator over the given range of the given replaceable
* string.
* The replaceable is used directly (not copied) and the current index is
* positioned at <code>start</code>.
* Note that <code>start</code> and <code>limit</code> are stored as given;
* this constructor performs no validation of the range against the text.
* @param replaceable text to be iterated over
* @param start offset of the first character to iterate
* @param limit offset of the character following the last character to
* iterate
*/
public UCharacterIterator(Replaceable replaceable, int start, int limit)
{
m_replaceable_ = replaceable;
m_start_ = start;
m_limit_ = limit;
m_index_ = m_start_;
}
// public methods ----------------------------------------------------------
/**
@ -87,7 +122,7 @@ public final class UCharacterIterator implements CharacterIterator
*/
public char current()
{
if (m_index_ >= 0 && m_index_ < m_length_) {
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
return m_replaceable_.charAt(m_index_);
}
return DONE;
@ -99,7 +134,7 @@ public final class UCharacterIterator implements CharacterIterator
*/
public int currentCodePoint()
{
if (m_index_ >= 0 && m_index_ < m_length_) {
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
return m_replaceable_.char32At(m_index_);
}
return DONE_CODEPOINT;
@ -111,26 +146,28 @@ public final class UCharacterIterator implements CharacterIterator
*/
public char first()
{
m_index_ = 0;
m_index_ = m_start_;
return current();
}
/**
* Returns the start of the text.
* @return 0
* Returns the start of the text to iterate.
* @return by default this method will return 0, unless a range for
* iteration had been specified during construction.
*/
public int getBeginIndex()
{
return 0;
return m_start_;
}
/**
* Returns the length of the text
* @return length of the text
* Returns the limit offset of the text to iterate
* @return by default this method returns the length of the text, unless a
* range for iteration had been specified during construction.
*/
public int getEndIndex()
{
return m_length_;
return m_limit_;
}
/**
@ -143,31 +180,31 @@ public final class UCharacterIterator implements CharacterIterator
}
/**
* Gets the last UTF16 character from the text and shifts the index to the
* end of the text accordingly.
* @return the last UTF16 character
* Gets the last UTF16 iterateable character from the text and shifts the
* index to the end of the text accordingly.
* @return the last UTF16 iterateable character
*/
public char last()
{
if (m_length_ != 0) {
m_index_ = m_length_ - 1;
if (m_limit_ != m_start_) {
m_index_ = m_limit_ - 1;
return m_replaceable_.charAt(m_index_);
}
m_index_ = m_length_;
m_index_ = m_limit_;
return DONE;
}
/**
* Returns next UTF16 character and increments the iterator's index by 1.
* If the resulting index is greater or equal to the text length, the
* index is reset to the text length and a value of DONE_CODEPOINT is
* If the resulting index is greater or equal to the iteration limit, the
* index is reset to the text iteration limit and a value of DONE is
* returned.
* @return next UTF16 character in text or DONE if the new index is off the
* end of the text range.
* end of the text iteration limit.
*/
public char next()
{
if (m_index_ < m_length_) {
if (m_index_ < m_limit_) {
char result = m_replaceable_.charAt(m_index_);
m_index_ ++;
return result;
@ -182,20 +219,20 @@ public final class UCharacterIterator implements CharacterIterator
* with surrogate pairs intermixed. If the index of a leading or trailing
* code unit of a surrogate pair is given, return the code point after the
* surrogate pair.
* If the resulting index is greater or equal to the text length, the
* current index is reset to the text length and a value of DONE_CODEPOINT
* is returned.
* If the resulting index is greater or equal to the text iterateable limit,
* the current index is reset to the text iterateable limit and a value of
* DONE_CODEPOINT is returned.
* @return next codepoint in text or DONE_CODEPOINT if the new index is off the
* end of the text range.
* end of the text iterateable limit.
*/
public int nextCodePoint()
{
if (m_index_ < m_length_) {
if (m_index_ < m_limit_) {
char ch = m_replaceable_.charAt(m_index_);
m_index_ ++;
if (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
ch <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
m_index_ < m_length_) {
m_index_ < m_limit_) {
char trail = m_replaceable_.charAt(m_index_);
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
@ -212,14 +249,15 @@ public final class UCharacterIterator implements CharacterIterator
/**
* Returns previous UTF16 character and decrements the iterator's index by
* 1.
* If the resulting index is less than 0, the index is reset to 0 and a
* value of DONE_CODEPOINT is returned.
* If the resulting index is less than the start of the text iteration range,
* the index is reset to the start of the text iteration range and a value of
* DONE is returned.
* @return previous UTF16 character in text or DONE if the new index is off the
* start of the text range.
* start of the text iteration range.
*/
public char previous()
{
if (m_index_ > 0) {
if (m_index_ > m_start_) {
m_index_ --;
return m_replaceable_.charAt(m_index_);
}
@ -233,19 +271,20 @@ public final class UCharacterIterator implements CharacterIterator
* with surrogate pairs intermixed. If the index of a leading or trailing
* code unit of a surrogate pair is given, return the code point before the
* surrogate pair.
* If the resulting index is less than 0, the current index is reset to 0
* and a value of DONE_CODEPOINT is returned.
* If the resulting index is less than the text iterateable range, the
* current index is reset to the start of the range and a value of
* DONE_CODEPOINT is returned.
* @return previous codepoint in text or DONE_CODEPOINT if the new index is
* off the start of the text range.
* off the start of the text iteration range.
*/
public int previousCodePoint()
{
if (m_index_ > 0) {
if (m_index_ > m_start_) {
m_index_ --;
char ch = m_replaceable_.charAt(m_index_);
if (ch >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
ch <= UTF16.TRAIL_SURROGATE_MAX_VALUE &&
m_index_ > 0) {
m_index_ > m_start_) {
char lead = m_replaceable_.charAt(m_index_);
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
@ -267,12 +306,11 @@ public final class UCharacterIterator implements CharacterIterator
* @exception IllegalArgumentException is thrown if an invalid index is
* supplied. i.e. index is out of bounds.
* @return the character at the specified index or DONE if the specified
* index is equal to the end of the text.
* index is equal to the limit of the text iteration range.
*/
public char setIndex(int index)
{
int length = m_replaceable_.length();
if (index < 0 || index > length) {
if (index < m_start_ || index > m_limit_) {
throw new IllegalArgumentException("Index index out of bounds");
}
m_index_ = index;
@ -290,7 +328,12 @@ public final class UCharacterIterator implements CharacterIterator
*/
private int m_index_;
/**
* Replaceable text length
* Start offset of iterateable range, by default this is 0
*/
private int m_length_;
private int m_start_;
/**
* Limit offset of iterateable range, by default this is the length of the
* string
*/
private int m_limit_;
}

View file

@ -0,0 +1,382 @@
/**
*******************************************************************************
* Copyright (C) 1996-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/BOSCU.java,v $
* $Date: 2002/05/14 16:48:48 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.UCharacterIterator;
/**
* <p>Binary Ordered Compression Scheme for Unicode</p>
*
* <p>Specific application:<br>
* Encode a Unicode string for the identical level of a sort key.<br>
* Restrictions:
* <ul>
* <li> byte stream (unsigned 8-bit bytes)
* <li> lexical order of the identical-level run must be the same as code
* point order for the string
* <li> avoid byte values 0, 1, 2
* </ul>
* </p>
*
* <p>Method: Slope Detection<br>
* Remember the previous code point (initial 0).
* For each cp in the string, encode the difference to the previous one.
* </p>
* <p>With a compact encoding of differences, this yields good results for
* small scripts and UTF-like results otherwise.
* </p>
* <p>Encoding of differences:<br>
* <ul>
* <li>Similar to a UTF, encoding the length of the byte sequence in the lead
* bytes.
* <li> Does not need to be friendly for decoding or random access
* (trail byte values may overlap with lead/single byte values).
* <li> The signedness must be encoded as the most significant part.
* </ul>
* </p>
* <p>We encode differences with few bytes if their absolute values are small.
* For correct ordering, we must treat the entire value range -10ffff..+10ffff
* in ascending order, which forbids encoding the sign and the absolute value
* separately.
* Instead, we split the lead byte range in the middle and encode non-negative
* values going up and negative values going down.
* </p>
* <p>For very small absolute values, the difference is added to a middle byte
* value for single-byte encoded differences.
* For somewhat larger absolute values, the difference is divided by the number
* of byte values available, the modulo is used for one trail byte, and the
* quotient is added to a lead byte avoiding the single-byte range.
* For large absolute values, the difference is similarly encoded in three
* or four bytes.
* </p>
* <p>This encoding does not use byte values 0, 1, 2, but uses all other byte
* values for lead/single bytes so that the middle range of single bytes is as
* large as possible.
* </p>
* <p>Note that the lead byte ranges overlap some, but that the sequences as a
* whole are well ordered. I.e., even if the lead byte is the same for
* sequences of different lengths, the trail bytes establish correct order.
* It would be possible to encode slightly larger ranges for each length (>1)
* by subtracting the lower bound of the range. However, that would also slow
* down the calculation.
* </p>
* <p>For the actual string encoding, an optimization moves the previous code
* point value to the middle of its Unicode script block to minimize the
* differences in same-script text runs.
* </p>
* @author Syn Wee Quek
* @since release 2.2, May 3rd 2002
* @draft 2.2
*/
public class BOSCU
{
    // public methods -------------------------------------------------------

    /**
     * <p>Encode the code points of a string as a sequence of byte-encoded
     * differences (slope detection), preserving lexical order.</p>
     * <p>Optimize the difference-taking for runs of Unicode text within
     * small scripts:<br>
     * Most small scripts are allocated within aligned 128-blocks of Unicode
     * code points. Lexical order is preserved if "prev" is always moved
     * into the middle of such a block.</p>
     * <p>Additionally, "prev" is moved from anywhere in the Unihan area into
     * the middle of that area.</p>
     * <p>Note that the identical-level run in a sort key is generated from
     * NFD text - there are never Hangul characters included.</p>
     * <p>No bounds checking is done on the output buffer; the caller must
     * make sure that at least
     * {@link #lengthOfIdenticalLevelRun(String)} bytes are available
     * starting at offset.</p>
     * @param source text source
     * @param buffer output buffer
     * @param offset to start writing to
     * @return end offset where the writing stops
     */
    public static int writeIdenticalLevelRun(String source, byte buffer[],
                                             int offset)
    {
        int prev = 0;
        UCharacterIterator iterator = new UCharacterIterator(source);
        int codepoint = iterator.nextCodePoint();
        while (codepoint != UCharacterIterator.DONE_CODEPOINT) {
            prev = getAdjustedPrevious(prev);
            offset = writeDiff(codepoint - prev, buffer, offset);
            prev = codepoint;
            codepoint = iterator.nextCodePoint();
        }
        return offset;
    }

    /**
     * How many bytes would writeIdenticalLevelRun() write?
     * This walks the code points exactly like writeIdenticalLevelRun() so
     * that both methods always agree on the number of bytes.
     * @param source text source string
     * @return the length of the BOSCU result
     */
    public static int lengthOfIdenticalLevelRun(String source)
    {
        int prev = 0;
        int result = 0;
        UCharacterIterator iterator = new UCharacterIterator(source);
        int codepoint = iterator.nextCodePoint();
        while (codepoint != UCharacterIterator.DONE_CODEPOINT) {
            prev = getAdjustedPrevious(prev);
            // measure the current code point before advancing; advancing
            // first would skip the first code point and measure the
            // end-of-text sentinel instead
            result += lengthOfDiff(codepoint - prev);
            prev = codepoint;
            codepoint = iterator.nextCodePoint();
        }
        return result;
    }

    // private data members --------------------------------------------------

    /**
     * Do not use byte values 0, 1, 2 because they are separators in sort keys.
     */
    private static final int SLOPE_MIN_ = 3;
    private static final int SLOPE_MAX_ = 0xff;
    private static final int SLOPE_MIDDLE_ = 0x81;
    private static final int SLOPE_TAIL_COUNT_ = SLOPE_MAX_ - SLOPE_MIN_ + 1;
    private static final int SLOPE_MAX_BYTES_ = 4;

    /**
     * Number of lead bytes:
     * 1        middle byte for 0
     * 2*80=160 single bytes for !=0
     * 2*42=84  for double-byte values
     * 2*3=6    for 3-byte values
     * 2*1=2    for 4-byte values
     *
     * The sum must be <=SLOPE_TAIL_COUNT.
     *
     * Why these numbers?
     * - There should be >=128 single-byte values to cover 128-blocks
     *   with small scripts.
     * - There should be >=20902 single/double-byte values to cover Unihan.
     * - It helps CJK Extension B some if there are 3-byte values that cover
     *   the distance between them and Unihan.
     *   This also helps to jump among distant places in the BMP.
     * - Four-byte values are necessary to cover the rest of Unicode.
     *
     * Symmetrical lead byte counts are for convenience.
     * With an equal distribution of even and odd differences there is also
     * no advantage to asymmetrical lead byte counts.
     */
    private static final int SLOPE_SINGLE_ = 80;
    private static final int SLOPE_LEAD_2_ = 42;
    private static final int SLOPE_LEAD_3_ = 3;
    private static final int SLOPE_LEAD_4_ = 1;

    /**
     * The difference value range for single-byters.
     */
    private static final int SLOPE_REACH_POS_1_ = SLOPE_SINGLE_;
    private static final int SLOPE_REACH_NEG_1_ = (-SLOPE_SINGLE_);

    /**
     * The difference value range for double-byters.
     */
    private static final int SLOPE_REACH_POS_2_ =
                        SLOPE_LEAD_2_ * SLOPE_TAIL_COUNT_ + SLOPE_LEAD_2_ - 1;
    private static final int SLOPE_REACH_NEG_2_ = (-SLOPE_REACH_POS_2_ - 1);

    /**
     * The difference value range for 3-byters.
     */
    private static final int SLOPE_REACH_POS_3_ = SLOPE_LEAD_3_
                                                  * SLOPE_TAIL_COUNT_
                                                  * SLOPE_TAIL_COUNT_
                                                  + (SLOPE_LEAD_3_ - 1)
                                                  * SLOPE_TAIL_COUNT_ +
                                                  (SLOPE_TAIL_COUNT_ - 1);
    private static final int SLOPE_REACH_NEG_3_ = (-SLOPE_REACH_POS_3_ - 1);

    /**
     * The lead byte start values.
     */
    private static final int SLOPE_START_POS_2_ = SLOPE_MIDDLE_
                                                  + SLOPE_SINGLE_ + 1;
    private static final int SLOPE_START_POS_3_ = SLOPE_START_POS_2_
                                                  + SLOPE_LEAD_2_;
    private static final int SLOPE_START_NEG_2_ = SLOPE_MIDDLE_ +
                                                  SLOPE_REACH_NEG_1_;
    private static final int SLOPE_START_NEG_3_ = SLOPE_START_NEG_2_
                                                  - SLOPE_LEAD_2_;

    // private constructor ---------------------------------------------------

    /**
     * Constructor private to prevent instantiation
     */
    private BOSCU()
    {
    }

    // private methods -------------------------------------------------------

    /**
     * Moves the previous code point to the reference position used for
     * difference-taking: a fixed position inside its aligned 128-block, or,
     * for anything in U+4e00..U+9fff, a fixed position in the Unihan area
     * chosen so that double-byte differences reach down from the upper end.
     * @param prev previous code point (0 at the start of the text)
     * @return adjusted reference value to subtract from the next code point
     */
    private static final int getAdjustedPrevious(int prev)
    {
        if (prev < 0x4e00 || prev >= 0xa000) {
            return (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
        }
        // Unihan U+4e00..U+9fa5:
        // double-bytes down from the upper end
        return 0x9fff - SLOPE_REACH_POS_2_;
    }

    /**
     * Integer division and modulo with negative numerators
     * yields negative modulo results and quotients that are one more than
     * what we need here.
     * This method returns the floored quotient and the matching
     * non-negative modulo packed into one long.
     * @param number which operations are to be performed on
     * @param factor the factor to use for division
     * @return (result of division) << 32 | modulo
     */
    private static final long getNegDivMod(int number, int factor)
    {
        int modulo = number % factor;
        long result = number / factor;
        if (modulo < 0) {
            -- result;
            modulo += factor;
        }
        // modulo is never negative here, so widening it to long does not
        // disturb the quotient stored in the upper 32 bits
        return (result << 32) | modulo;
    }

    /**
     * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
     * preserving lexical order
     * @param diff difference value to encode
     * @param buffer byte buffer to append to
     * @param offset to the byte buffer to start appending
     * @return end offset where the appending stops
     */
    private static final int writeDiff(int diff, byte buffer[], int offset)
    {
        if (diff >= SLOPE_REACH_NEG_1_) {
            if (diff <= SLOPE_REACH_POS_1_) {
                buffer[offset ++] = (byte)(SLOPE_MIDDLE_ + diff);
            }
            else if (diff <= SLOPE_REACH_POS_2_) {
                buffer[offset ++] = (byte)(SLOPE_START_POS_2_
                                           + (diff / SLOPE_TAIL_COUNT_));
                buffer[offset ++] = (byte)(SLOPE_MIN_ +
                                           (diff % SLOPE_TAIL_COUNT_));
            }
            else if (diff <= SLOPE_REACH_POS_3_) {
                // trail bytes are produced least significant first
                buffer[offset + 2] = (byte)(SLOPE_MIN_
                                            + (diff % SLOPE_TAIL_COUNT_));
                diff /= SLOPE_TAIL_COUNT_;
                buffer[offset + 1] = (byte)(SLOPE_MIN_
                                            + (diff % SLOPE_TAIL_COUNT_));
                buffer[offset] = (byte)(SLOPE_START_POS_3_
                                        + (diff / SLOPE_TAIL_COUNT_));
                offset += 3;
            }
            else {
                buffer[offset + 3] = (byte)(SLOPE_MIN_
                                            + diff % SLOPE_TAIL_COUNT_);
                diff /= SLOPE_TAIL_COUNT_;
                // second trail byte belongs at offset + 2; writing it to
                // offset would be clobbered by the lead byte below
                buffer[offset + 2] = (byte)(SLOPE_MIN_
                                            + diff % SLOPE_TAIL_COUNT_);
                diff /= SLOPE_TAIL_COUNT_;
                buffer[offset + 1] = (byte)(SLOPE_MIN_
                                            + diff % SLOPE_TAIL_COUNT_);
                buffer[offset] = (byte)SLOPE_MAX_;
                offset += 4;
            }
        }
        else {
            long division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
            int modulo = (int)division;
            if (diff >= SLOPE_REACH_NEG_2_) {
                diff = (int)(division >> 32);
                buffer[offset ++] = (byte)(SLOPE_START_NEG_2_ + diff);
                buffer[offset ++] = (byte)(SLOPE_MIN_ + modulo);
            }
            else if (diff >= SLOPE_REACH_NEG_3_) {
                buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
                diff = (int)(division >> 32);
                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
                modulo = (int)division;
                diff = (int)(division >> 32);
                buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
                buffer[offset] = (byte)(SLOPE_START_NEG_3_ + diff);
                offset += 3;
            }
            else {
                buffer[offset + 3] = (byte)(SLOPE_MIN_ + modulo);
                diff = (int)(division >> 32);
                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
                modulo = (int)division;
                diff = (int)(division >> 32);
                buffer[offset + 2] = (byte)(SLOPE_MIN_ + modulo);
                division = getNegDivMod(diff, SLOPE_TAIL_COUNT_);
                modulo = (int)division;
                buffer[offset + 1] = (byte)(SLOPE_MIN_ + modulo);
                buffer[offset] = SLOPE_MIN_;
                offset += 4;
            }
        }
        return offset;
    }

    /**
     * How many bytes would writeDiff() write?
     * @param diff difference value to measure
     * @return number of bytes (1..4) writeDiff() uses for diff
     */
    private static final int lengthOfDiff(int diff)
    {
        if (diff >= SLOPE_REACH_NEG_1_) {
            if (diff <= SLOPE_REACH_POS_1_) {
                return 1;
            }
            else if (diff <= SLOPE_REACH_POS_2_) {
                return 2;
            }
            else if (diff <= SLOPE_REACH_POS_3_) {
                return 3;
            }
            else {
                return 4;
            }
        }
        else {
            if (diff >= SLOPE_REACH_NEG_2_) {
                return 2;
            }
            else if (diff >= SLOPE_REACH_NEG_3_) {
                return 3;
            }
            else {
                return 4;
            }
        }
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,260 @@
/**
*******************************************************************************
* Copyright (C) 1996-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationKey.java,v $
* $Date: 2002/05/14 16:48:49 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.util.Arrays;
/**
* <p>A <code>CollationKey</code> represents a <code>String</code> under the
* rules of a specific <code>Collator</code> object. Comparing two
* <code>CollationKey</code>s returns the relative order of the
* <code>String</code>s they represent. Using <code>CollationKey</code>s to
* compare <code>String</code>s is generally faster than using
* <code>Collator.compare</code>. Thus, when the <code>String</code>s must be
* compared multiple times, for example when sorting a list of
* <code>String</code>s, it is more efficient to use <code>CollationKey</code>s.
* </p>
* <p>You can not create <code>CollationKey</code>s directly. Rather, generate
* them by calling <code>Collator.getCollationKey(String)</code>. You can only
* compare <code>CollationKey</code>s generated from the same
* <code>Collator</code> object.</p>
* <p>Generating a <code>CollationKey</code> for a <code>String</code>
* involves examining the entire <code>String</code> and converting it to
* series of bits that can be compared bitwise. This allows fast comparisons
* once the keys are generated. The cost of generating keys is recouped in
* faster comparisons when <code>String</code>s need to be compared many
* times. On the other hand, the result of a comparison is often determined by
* the first couple of characters of each <code>String</code>.
* <code>Collator.compare(String, String)</code> examines only as many characters as it needs
* which allows it to be faster when doing single comparisons.</p>
* <p>The following example shows how <code>CollationKey</code>s might be used
* to sort a list of <code>String</code>s.</p>
* <blockquote>
* <pre>
* // Create an array of CollationKeys for the Strings to be sorted.
* Collator myCollator = Collator.getInstance();
* CollationKey[] keys = new CollationKey[3];
* keys[0] = myCollator.getCollationKey("Tom");
* keys[1] = myCollator.getCollationKey("Dick");
* keys[2] = myCollator.getCollationKey("Harry");
* sort( keys );
* <br>
* //...
* <br>
* // Inside body of sort routine, compare keys this way
* if( keys[i].compareTo( keys[j] ) > 0 )
* // swap keys[i] and keys[j]
* <br>
* //...
* <br>
* // Finally, when we've returned from sort.
* System.out.println( keys[0].getSourceString() );
* System.out.println( keys[1].getSourceString() );
* System.out.println( keys[2].getSourceString() );
* </pre>
* </blockquote>
*
* @see Collator
* @see RuleBasedCollator
* @author Syn Wee Quek
* @since release 2.2, April 18 2002
* @draft 2.2
*/
public final class CollationKey implements Comparable
{
// public methods -------------------------------------------------------
// public getters -------------------------------------------------------
/**
* Returns the String that this CollationKey represents.
* @return source string that this CollationKey represents
* @draft 2.2
*/
public String getSourceString()
{
return m_source_;
}
/**
* <p>Duplicates and returns the value of this CollationKey as a sequence
* of big-endian bytes.</p>
* <p>If two CollationKeys could be legitimately compared, then one could
* compare the byte arrays of each to obtain the same result.</p>
* @return CollationKey value in a sequence of big-endian byte bytes.
* @draft 2.2
*/
public byte[] toByteArray()
{
int length = 0;
while (true) {
if (m_key_[length] == 0) {
break;
}
length ++;
}
length ++;
byte result[] = new byte[length];
System.arraycopy(m_key_, 0, result, 0, length);
return result;
}
// public other methods -------------------------------------------------
/**
* <p>Compare this CollationKey to the target CollationKey. The collation
* rules of the Collator object which created these keys are applied.</p>
* <p><strong>Note:</strong> CollationKeys created by different Collators
* can not be compared.</p>
* @param target target CollationKey
* @return an integer value, if value is less than zero this CollationKey
* is less than than target, if value is zero if they are equal
* and value is greater than zero if this CollationKey is greater
* than target.
* @see Collator#compare(String, String)
* @draft 2.2
*/
public int compareTo(CollationKey target)
{
int i = 0;
while (m_key_[i] != 0 && target.m_key_[i] != 0) {
int key = m_key_[i] & 0xFF;
int targetkey = target.m_key_[i] & 0xFF;
if (key < targetkey) {
return -1;
}
if (targetkey < key) {
return 1;
}
i ++;
}
// last comparison if we encounter a 0
int key = m_key_[i] & 0xFF;
int targetkey = target.m_key_[i] & 0xFF;
if (key < targetkey) {
return -1;
}
if (targetkey < key) {
return 1;
}
return 0;
}
/**
* <p>Compares this CollationKey with the specified Object.</p>
* @param obj the Object to be compared.
* @return Returns a negative integer, zero, or a positive integer
* respectively if this CollationKey is less than, equal to, or
* greater than the given Object.
* @exception ClassCastException thrown when the specified Object is not a
* CollationKey.
* @see #compareTo(CollationKey)
* @draft 2.2
*/
public int compareTo(Object obj)
{
return compareTo((CollationKey)obj);
}
/**
* <p>Compare this CollationKey and the target CollationKey for equality.
* </p>
* <p>The collation rules of the Collator object which created these keys
* are applied.</p>
* <p><strong>Note:</strong> CollationKeys created by different Collators
* can not be compared.</p>
* @param target the CollationKey to compare to.
* @return true if two objects are equal, false otherwise.
* @draft 2.2
*/
public boolean equals(Object target)
{
if (this == target) {
return true;
}
if (target == null || !(target instanceof CollationKey)) {
return false;
}
CollationKey other = (CollationKey)target;
int i = 0;
while (true) {
if (m_key_[i] != other.m_key_[i]) {
return false;
}
if (m_key_[i] == 0) {
break;
}
i ++;
}
return true;
}
/**
* <p>Creates a hash code for this CollationKey. The hash value is
* calculated on the key itself, not the String from which the key was
* created. Thus if x and y are CollationKeys, then
* x.hashCode(x) == y.hashCode() if x.equals(y) is true. This allows
* language-sensitive comparison in a hash table.</p>
* <p>See the CollatinKey class description for an example.</p>
* @return the hash value.
* @draft 2.2
*/
public int hashCode()
{
if (m_hashCode_ == 0) {
int size = m_key_.length >> 1;
StringBuffer key = new StringBuffer(size);
int i = 0;
while (m_key_[i] != 0 && m_key_[i + 1] != 0) {
key.append((m_key_[i] << 8) | m_key_[i + 1]);
i += 2;
}
if (m_key_[i] != 0) {
key.append(m_key_[i] << 8);
}
m_hashCode_ = key.hashCode();
}
return m_hashCode_;
}
// protected constructor ------------------------------------------------
/**
* Protected CollationKey can only be generated by Collator objects
* @param source string the CollationKey represents
* @param key sort key array of bytes
* @param size of sort key
* @draft 2v2
*/
CollationKey(String source, byte key[])
{
m_source_ = source;
m_key_ = key;
m_hashCode_ = 0;
}
// private data members -------------------------------------------------
/**
* Source string this CollationKey represents
*/
private String m_source_;
/**
* Sequence of bytes that represents the sort key
*/
private byte m_key_[];
/**
* Hash code for the key
*/
private int m_hashCode_;
}

View file

@ -0,0 +1,454 @@
/**
*******************************************************************************
* Copyright (C) 1996-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Collator.java,v $
* $Date: 2002/05/14 16:48:49 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.util.Locale;
/**
* <p>The Collator class performs locale-sensitive String comparison.
* You use this class to build searching and sorting routines for natural
* language text.</p>
* <p>Collator is an abstract base class. Subclasses implement specific
* collation strategies. One subclass, RuleBasedCollator, is currently
* provided and is applicable to a wide set of languages. Other subclasses
* may be created to handle more specialized needs.</p>
* <p>Like other locale-sensitive classes, you can use the static factory
* method, getInstance, to obtain the appropriate Collator object for a given
* locale. You will only need to look at the subclasses of Collator if you need
* to understand the details of a particular collation strategy or if you need
* to modify that strategy. </p>
* <p>The following example shows how to compare two strings using the Collator
* for the default locale.
* <pre>
* // Compare two strings in the default locale
* Collator myCollator = Collator.getInstance();
* if (myCollator.compare("abc", "ABC") < 0) {
* System.out.println("abc is less than ABC");
* }
* else {
* System.out.println("abc is greater than or equal to ABC");
* }
* </pre>
* <p>You can set a <code>Collator</code>'s <em>strength</em> property to
* determine the level of difference considered significant in comparisons.
* Four strengths are provided: <code>PRIMARY</code>, <code>SECONDARY</code>,
* <code>TERTIARY</code>, and <code>IDENTICAL</code>. The exact assignment of
* strengths to language features is locale dependant. For example, in Czech,
* "e" and "f" are considered primary differences, while "e" and "\u00EA" are
* secondary differences, "e" and "E" are tertiary differences and "e" and "e"
* are identical. The following shows how both case and accents could be
* ignored for US English.</p>
* <pre>
* //Get the Collator for US English and set its strength to PRIMARY
* Collator usCollator = Collator.getInstance(Locale.US);
* usCollator.setStrength(Collator.PRIMARY);
* if (usCollator.compare("abc", "ABC") == 0) {
* System.out.println("Strings are equivalent");
* }
* </pre>
* <p>For comparing Strings exactly once, the compare method provides the best
* performance. When sorting a list of Strings however, it is generally
* necessary to compare each String multiple times. In this case,
* CollationKeys provide better performance. The CollationKey class converts a
* String to a series of bits that can be compared bitwise against other
* CollationKeys. A CollationKey is created by a Collator object for a given
* String.</p>
* <p>Note: CollationKeys from different Collators can not be compared. See the
* class description for CollationKey for an example using CollationKeys.
* </p>
* @author Syn Wee Quek
* @since release 2.2, April 18 2002
* @draft 2.2
*/
public abstract class Collator
{
// public data members ---------------------------------------------------
/**
 * Collator strength value. When set, only PRIMARY differences are
 * considered significant during comparison. The assignment of strengths
 * to language features is locale dependent. A common example is for
 * different base letters ("a" vs "b") to be considered a PRIMARY
 * difference.
 * @see #setStrength
 * @see #getStrength
 * @draft 2.2
 */
public final static int PRIMARY
= RuleBasedCollator.AttributeValue.PRIMARY_;
/**
 * Collator strength value. When set, only SECONDARY and above
 * differences are considered significant during comparison. The
 * assignment of strengths to language features is locale dependent. A
 * common example is for different accented forms of the same base letter
 * ("a" vs "\u00E4") to be considered a SECONDARY difference.
 * @see #setStrength
 * @see #getStrength
 * @draft 2.2
 */
public final static int SECONDARY
= RuleBasedCollator.AttributeValue.SECONDARY_;
/**
 * Collator strength value. When set, only TERTIARY and above differences
 * are considered significant during comparison. The assignment of
 * strengths to language features is locale dependent. A common example is
 * for case differences ("a" vs "A") to be considered a TERTIARY
 * difference.
 * @see #setStrength
 * @see #getStrength
 * @draft 2.2
 */
public final static int TERTIARY
= RuleBasedCollator.AttributeValue.TERTIARY_;
/**
 * Collator strength value. When set, only QUATERNARY and above differences
 * are considered significant during comparison. The assignment of
 * strengths to language features is locale dependent.
 * @see #setStrength
 * @see #getStrength
 * @draft 2.2
 */
public final static int QUATERNARY
= RuleBasedCollator.AttributeValue.QUATERNARY_;
/**
 * <p>Collator strength value. When set, all differences are considered
 * significant during comparison. The assignment of strengths to language
 * features is locale dependent. A common example is for control
 * characters ("&#092;u0001" vs "&#092;u0002") to be considered equal at
 * the PRIMARY, SECONDARY, and TERTIARY levels but different at the
 * IDENTICAL level. Additionally, differences between pre-composed
 * accents such as "&#092;u00C0" (A-grave) and combining accents such as
 * "A&#092;u0300" (A, combining-grave) will be considered significant at
 * the tertiary level if decomposition is set to NO_DECOMPOSITION.
 * </p>
 * <p>Note: this value is different from the JDK's.</p>
 * @see #setStrength
 * @see #getStrength
 * @draft 2.2
 */
public final static int IDENTICAL
= RuleBasedCollator.AttributeValue.IDENTICAL_;
/**
 * <p>Decomposition mode value. With NO_DECOMPOSITION set, accented
 * characters will not be decomposed for collation. This is the default
 * setting and provides the fastest collation but will only produce
 * correct results for languages that do not use accents.</p>
 * <p>Note: this value is different from the JDK's.</p>
 * <p>NOTE(review): the protected Collator constructor initialises the
 * decomposition mode to CANONICAL_DECOMPOSITION, which conflicts with the
 * "default setting" statement above - confirm which one is intended.</p>
 * @see #getDecomposition
 * @see #setDecomposition
 * @draft 2.2
 */
public final static int NO_DECOMPOSITION
= RuleBasedCollator.AttributeValue.OFF_;
/**
 * <p>Decomposition mode value. With CANONICAL_DECOMPOSITION set,
 * characters that are canonical variants according to Unicode 2.0 will be
 * decomposed for collation. This should be used to get correct collation
 * of accented characters.</p>
 * <p>CANONICAL_DECOMPOSITION corresponds to Normalization Form D as
 * described in <a href="http://www.unicode.org/unicode/reports/tr15/">
 * Unicode Technical Report #15</a>.</p>
 * @see #getDecomposition
 * @see #setDecomposition
 * @draft 2.2
 */
public final static int CANONICAL_DECOMPOSITION = 1;
/**
 * <p>Decomposition mode value. With FULL_DECOMPOSITION set, both Unicode
 * canonical variants and Unicode compatibility variants will be
 * decomposed for collation. This causes not only accented characters to
 * be collated, but also characters that have special formats to be
 * collated with their nominal form. For example, the half-width and
 * full-width ASCII and Katakana characters are then collated together.
 * FULL_DECOMPOSITION is the most complete and therefore the slowest
 * decomposition mode.</p>
 * <p>
 * FULL_DECOMPOSITION corresponds to Normalization Form KD as described in
 * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode
 * Technical Report #15</a>.</p>
 * @see #getDecomposition
 * @see #setDecomposition
 * @draft 2.2
 */
public final static int FULL_DECOMPOSITION = 2;
// public methods --------------------------------------------------------
// public setters --------------------------------------------------------
/**
 * <p>Sets the strength property of this Collator, that is the minimum
 * level of difference that is treated as significant during
 * comparison.</p>
 * <p>See the Collator class description for an example of use.</p>
 * @param newStrength the new strength value.
 * @see #getStrength
 * @see #PRIMARY
 * @see #SECONDARY
 * @see #TERTIARY
 * @see #QUATERNARY
 * @see #IDENTICAL
 * @exception IllegalArgumentException if the new strength value is not
 *            one of PRIMARY, SECONDARY, TERTIARY, QUATERNARY or
 *            IDENTICAL.
 * @draft 2.2
 */
public synchronized void setStrength(int newStrength)
{
    // accept only the five strength constants declared by this class
    final boolean isKnownStrength = (newStrength == PRIMARY)
                                    || (newStrength == SECONDARY)
                                    || (newStrength == TERTIARY)
                                    || (newStrength == QUATERNARY)
                                    || (newStrength == IDENTICAL);
    if (!isKnownStrength) {
        throw new IllegalArgumentException("Incorrect comparison level.");
    }
    m_strength_ = newStrength;
}
/**
 * Sets the decomposition mode of this Collator. See getDecomposition
 * for a description of decomposition mode.
 * <p>The validated mode is stored exactly as given, so a subsequent
 * getDecomposition() call returns the value passed in here.</p>
 * @param decomposition the new decomposition mode
 * @see #getDecomposition
 * @see #NO_DECOMPOSITION
 * @see #CANONICAL_DECOMPOSITION
 * @see #FULL_DECOMPOSITION
 * @exception IllegalArgumentException if the given value is not one of
 *            NO_DECOMPOSITION, CANONICAL_DECOMPOSITION or
 *            FULL_DECOMPOSITION.
 * @draft 2.2
 */
public synchronized void setDecomposition(int decomposition)
{
    if ((decomposition != NO_DECOMPOSITION) &&
        (decomposition != CANONICAL_DECOMPOSITION) &&
        (decomposition != FULL_DECOMPOSITION)) {
        throw new IllegalArgumentException("Wrong decomposition mode.");
    }
    // Previously a NO_DECOMPOSITION request was silently replaced by
    // CANONICAL_DECOMPOSITION, so the caller's choice was lost and the
    // getter could never report NO_DECOMPOSITION.
    m_decomposition_ = decomposition;
}
// public getters --------------------------------------------------------
/**
 * Gets the Collator for the current default locale.
 * The default locale is determined by java.util.Locale.getDefault().
 * @return the Collator that {@link #getInstance(Locale)} produces for the
 *         default locale (for example, en_US).
 * @see java.util.Locale#getDefault
 * @see #getInstance(Locale)
 * @draft 2.2
 */
public static final Collator getInstance()
{
    final Locale defaultLocale = Locale.getDefault();
    return getInstance(defaultLocale);
}
/**
 * Gets the Collator for the desired locale.
 * @param locale the desired locale.
 * @return Collator for the desired locale if it is created successfully,
 *         otherwise if there is a failure, the default UCA collator will
 *         be returned.
 * @see java.util.Locale
 * @see java.util.ResourceBundle
 * @draft 2.2
 */
public static final Collator getInstance(Locale locale)
{
    Collator result;
    try {
        result = new RuleBasedCollator(locale);
    }
    catch (Exception e) {
        // Any failure while building the locale collator is deliberately
        // not propagated; the UCA collator is handed out instead.
        // NOTE(review): RuleBasedCollator.UCA_ looks like a shared static
        // instance - confirm that callers changing its strength or
        // decomposition cannot affect each other.
        result = RuleBasedCollator.UCA_;
    }
    return result;
}
/**
 * <p>Returns the strength property of this Collator, that is the minimum
 * level of difference that is treated as significant during
 * comparison.</p>
 * <p>See the Collator class description for an example of use.</p>
 * @return this Collator's current strength property.
 * @see #setStrength
 * @see #PRIMARY
 * @see #SECONDARY
 * @see #TERTIARY
 * @see #QUATERNARY
 * @see #IDENTICAL
 * @draft 2.2
 */
public int getStrength()
{
    return this.m_strength_;
}
/**
 * <p>Gets the decomposition mode of this Collator. Decomposition mode
 * determines how Unicode composed characters are handled. Adjusting
 * decomposition mode allows the user to select between faster and more
 * complete collation behavior.</p>
 * <p>The three values for decomposition mode are:</p>
 * <UL>
 * <LI>NO_DECOMPOSITION,
 * <LI>CANONICAL_DECOMPOSITION
 * <LI>FULL_DECOMPOSITION.
 * </UL>
 * <p>See the documentation for these three constants for a description
 * of their meaning.</p>
 * @return the decomposition mode currently stored in this Collator
 * @see #setDecomposition
 * @see #NO_DECOMPOSITION
 * @see #CANONICAL_DECOMPOSITION
 * @see #FULL_DECOMPOSITION
 * @draft 2.2
 */
public int getDecomposition()
{
    return this.m_decomposition_;
}
// public other methods -------------------------------------------------
/**
 * Convenience method that tests two strings for equality under this
 * Collator's collation rules.
 * @param source the source string to be compared with.
 * @param target the target string to be compared with.
 * @return true if compare(source, target) reports the strings as equal,
 *         false otherwise.
 * @see #compare
 * @draft 2.2
 */
public boolean equals(String source, String target)
{
    final int order = compare(source, target);
    return order == 0;
}
/**
 * Clones this Collator by delegating to Object.clone().
 * <p>NOTE(review): the Collator class declaration does not list
 * Cloneable, so this presumably only succeeds for subclasses that
 * implement it; otherwise the InternalError below is raised - confirm
 * against the concrete subclasses.</p>
 * @return a cloned Collator of this object
 * @draft 2.2
 */
public Object clone()
{
    Object copy;
    try {
        copy = super.clone();
    }
    catch (CloneNotSupportedException e) {
        throw new InternalError();
    }
    return copy;
}
/**
 * Compares the equality of two Collators. Two Collators are considered
 * equal here when they are of exactly the same class and have the same
 * strength and decomposition mode.
 * @param that the Collator to be compared with this.
 * @return true if this Collator is the same as that Collator;
 *         false otherwise.
 * @draft 2.2
 */
public boolean equals(Object that)
{
    if (that == this) {
        return true;
    }
    if (that == null) {
        return false;
    }
    // only an object of exactly the same runtime class can be equal
    if (that.getClass() != getClass()) {
        return false;
    }
    final Collator rhs = (Collator) that;
    if (rhs.m_strength_ != m_strength_) {
        return false;
    }
    return rhs.m_decomposition_ == m_decomposition_;
}
// public abstract methods -----------------------------------------------
/**
 * Generates the hash code for this Collator.
 * Declared abstract so that every concrete subclass has to supply its own
 * implementation; per the java.lang.Object contract the result has to
 * agree with equals(Object).
 * @return the hash code of this Collator
 * @draft 2.2
 */
public abstract int hashCode();
/**
 * <p>Compares the source string to the target string according to the
 * collation rules for this Collator. Returns an integer less than, equal
 * to or greater than zero depending on whether the source String is less
 * than, equal to or greater than the target string. See the Collator
 * class description for an example of use.</p>
 * <p>For a one-time comparison, this method has the best performance. If
 * a given String will be involved in multiple comparisons,
 * CollationKey.compareTo() has the best performance. See the Collator
 * class description for an example using CollationKeys.</p>
 * @param source the source string.
 * @param target the target string.
 * @return an integer value. Value is less than zero if source is
 *         less than target, value is zero if source and target are equal,
 *         value is greater than zero if source is greater than target.
 * @see CollationKey
 * @see #getCollationKey
 * @see #equals(String, String)
 * @draft 2.2
 */
public abstract int compare(String source, String target);
/**
 * <p>Transforms the String into a series of bits that can be compared
 * bitwise to other CollationKeys. CollationKeys provide better
 * performance than Collator.compare() when Strings are involved in
 * multiple comparisons.</p>
 * <p>See the Collator class description for an example using
 * CollationKeys.</p>
 * @param source the string to be transformed into a collation key.
 * @return the CollationKey for the given String based on this Collator's
 *         collation rules. If the source String is null, a null
 *         CollationKey is returned.
 * @see CollationKey
 * @see #compare(String, String)
 * @draft 2.2
 */
public abstract CollationKey getCollationKey(String source);
// protected data members ------------------------------------------------
/**
 * Collation strength. Initialised to TERTIARY by the protected
 * constructor, changed through setStrength and reported by getStrength.
 */
protected int m_strength_;
/**
 * Decomposition mode. Initialised to CANONICAL_DECOMPOSITION by the
 * protected constructor, changed through setDecomposition and reported by
 * getDecomposition.
 */
protected int m_decomposition_;
// protected constructor -------------------------------------------------
/**
 * <p>Protected constructor for use by subclasses; starts the Collator off
 * with TERTIARY strength and CANONICAL_DECOMPOSITION.
 * Public access to creating Collators is handled by the API getInstance().
 * </p>
 * <p>NOTE(review): nothing in this body throws; the broad throws clause
 * is kept as is because subclasses and callers may rely on it.</p>
 * @exception Exception declared for subclasses; not thrown by this body
 * @draft 2.2
 */
protected Collator() throws Exception
{
    m_decomposition_ = CANONICAL_DECOMPOSITION;
    m_strength_ = TERTIARY;
}
// protected methods -----------------------------------------------------
// private variables -----------------------------------------------------
// private methods -------------------------------------------------------
}

View file

@ -0,0 +1,284 @@
/**
*******************************************************************************
* Copyright (C) 1996-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollatorReader.java,v $
* $Date: 2002/05/14 16:48:49 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.IntTrie;
/**
* <p>Internal reader class for ICU data file uca.dat containing
* Unicode Collation Algorithm data.</p>
* <p>This class simply reads uca.dat, authenticates that it is a valid
* ICU data file and splits its contents up into blocks of data for use in
* <a href="Collator.html">com.ibm.icu.text.Collator</a>.
* </p>
* <p>uca.dat, which is in big-endian format, is jarred together with this
* package.</p>
* @author Syn Wee Quek
* @since release 2.2, April 18 2002
* @draft 2.2
*/
final class CollatorReader
{
// protected constructor ---------------------------------------------
/**
 * <p>Protected constructor. Consumes and authenticates the ICU data
 * header of the stream before wrapping it for further reading.</p>
 * @param inputStream ICU uca.dat file input stream
 * @exception IOException thrown if the data file fails authentication
 * @draft 2.1
 */
protected CollatorReader(InputStream inputStream) throws IOException
{
    // same as the two-argument form with the ICU header always read
    this(inputStream, true);
}
/**
* <p>Protected constructor.</p>
* @param inputStream ICU uprop.dat file input stream
* @param readICUHeader flag to indicate if the ICU header has to be read
* @exception IOException throw if data file fails authentication
* @draft 2.1
*/
protected CollatorReader(InputStream inputStream, boolean readICUHeader)
throws IOException
{
if (readICUHeader) {
ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
DATA_FORMAT_VERSION_, UNICODE_VERSION_);
}
m_dataInputStream_ = new DataInputStream(inputStream);
}
// protected methods -------------------------------------------------
/**
 * Reads the fixed-layout collation header from the stream and breaks it
 * up into meaningful Collator data.
 * <p>The expansion and contraction offsets and the jamo-special flag are
 * stored in the argument collator. The byte sizes of the tables that
 * follow the header are derived from the header offsets and kept in this
 * reader's size fields for use by read().</p>
 * @param rbc RuleBasedCollator to populate with header information
 * @exception IOException thrown when there's a data error, including a
 *            stream that ends before the whole header has been consumed.
 */
protected void readHeader(RuleBasedCollator rbc) throws IOException
{
    // first header field; used below as the end offset of the last table
    int size = m_dataInputStream_.readInt();
    // all the offsets are in bytes
    // to get the address add to the header address and cast properly
    // Default options int options
    skipFully(4);
    // this one is needed only for UCA, to copy the appropriate
    // contractions
    skipFully(4);
    // reserved for future use
    m_dataInputStream_.readInt();
    // const uint8_t *mappingPosition;
    int mapping = m_dataInputStream_.readInt();
    // uint32_t *expansion;
    rbc.m_expansionOffset_ = m_dataInputStream_.readInt();
    // UChar *contractionIndex;
    rbc.m_contractionOffset_ = m_dataInputStream_.readInt();
    // uint32_t *contractionCEs;
    int contractionCE = m_dataInputStream_.readInt();
    // needed for various closures int contractionSize
    skipFully(4);
    // array of last collation element in expansion
    int expansionEndCE = m_dataInputStream_.readInt();
    // array of maximum expansion size corresponding to the expansion
    // collation elements with last element in expansionEndCE
    int expansionEndCEMaxSize = m_dataInputStream_.readInt();
    // size of endExpansionCE int expansionEndCESize
    skipFully(4);
    // hash table of unsafe code points
    int unsafe = m_dataInputStream_.readInt();
    // hash table of final code points in contractions.
    int contractionEnd = m_dataInputStream_.readInt();
    // int CEcount = m_dataInputStream_.readInt();
    skipFully(4);
    // is jamoSpecial
    rbc.m_isJamoSpecial_ = m_dataInputStream_.readBoolean();
    // padding after the one-byte flag
    skipFully(3);
    // byte version[] = new byte[4];
    skipFully(4);
    // byte charsetName[] = new byte[32]; // for charset CEs
    skipFully(32);
    skipFully(64); // for future use
    if (rbc.m_contractionOffset_ == 0) { // contraction can be null
        rbc.m_contractionOffset_ = mapping;
        contractionCE = mapping;
    }
    // each table ends where the next one starts, so a size is the
    // difference of two consecutive offsets
    m_expansionSize_ = rbc.m_contractionOffset_ - rbc.m_expansionOffset_;
    m_contractionIndexSize_ = contractionCE - rbc.m_contractionOffset_;
    m_contractionCESize_ = mapping - contractionCE;
    m_trieSize_ = expansionEndCE - mapping;
    m_expansionEndCESize_ = expansionEndCEMaxSize - expansionEndCE;
    m_expansionEndCEMaxSizeSize_ = unsafe - expansionEndCEMaxSize;
    m_unsafeSize_ = contractionEnd - unsafe;
    m_contractionEndSize_ = size - contractionEnd;
    rbc.m_contractionOffset_ >>= 1; // byte offset to char (2 byte) units
    rbc.m_expansionOffset_ >>= 2; // byte offset to int (4 byte) units
}

/**
 * Consumes exactly the given number of bytes from the data stream.
 * DataInputStream.skipBytes may skip fewer bytes than requested without
 * reporting it, which would misalign every field read afterwards, so the
 * bytes are read and discarded instead.
 * @param count number of bytes to discard
 * @exception IOException thrown when the stream ends before count bytes
 *            could be consumed or another read error occurs.
 */
private void skipFully(int count) throws IOException
{
    m_dataInputStream_.readFully(new byte[count]);
}
/**
 * Reads the collation options from the stream and updates the argument
 * Collator with them. Eight ints are consumed, in this order: variable
 * top, French collation, alternate handling, case first, case level,
 * normalization mode, strength and hiragana quaternary mode.
 * @param rbc RuleBasedCollator to populate
 * @exception IOException thrown when there's a data error.
 * @draft 2.2
 */
public void readOptions(RuleBasedCollator rbc) throws IOException
{
    final int variableTop = m_dataInputStream_.readInt();
    rbc.m_variableTopValue_ = variableTop;

    // each option is applied right after it is read, so a failure part
    // way through leaves the earlier options already set
    final int frenchCollation = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(RuleBasedCollator.Attribute.FRENCH_COLLATION_,
                            frenchCollation);

    final int alternateHandling = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(
                       RuleBasedCollator.Attribute.ALTERNATE_HANDLING_,
                       alternateHandling);

    final int caseFirst = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_FIRST_,
                            caseFirst);

    final int caseLevel = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(RuleBasedCollator.Attribute.CASE_LEVEL_,
                            caseLevel);

    final int normalizationMode = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(
                       RuleBasedCollator.Attribute.NORMALIZATION_MODE_,
                       normalizationMode);

    final int strength = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(RuleBasedCollator.Attribute.STRENGTH_,
                            strength);

    final int hiraganaQuaternary = m_dataInputStream_.readInt();
    rbc.setAttributeDefault(
                 RuleBasedCollator.Attribute.HIRAGANA_QUATERNARY_MODE_,
                 hiraganaQuaternary);
}
/**
 * Reads the whole stream of collation data and breaks it up into
 * meaningful Collator data.
 * <p>The header and the options are read first, followed by the
 * expansion table, the contraction index, the contraction CEs, the trie,
 * the expansion end CEs, their maximum sizes, the unsafe table and the
 * contraction end table, in that order.</p>
 * @param rbc RuleBasedCollator to populate
 * @exception IOException thrown when there's a data error, for instance
 *            when the header describes a table with a negative size or
 *            the trie does not have linear Latin-1 data.
 * @draft 2.2
 */
public void read(RuleBasedCollator rbc) throws IOException
{
    readHeader(rbc);
    readOptions(rbc);
    // Inconsistent header offsets give negative table sizes. Report them
    // as the documented IOException instead of letting the array
    // allocations below fail with NegativeArraySizeException.
    if (m_expansionSize_ < 0 || m_contractionIndexSize_ < 0
        || m_contractionCESize_ < 0 || m_expansionEndCESize_ < 0
        || m_expansionEndCEMaxSizeSize_ < 0 || m_unsafeSize_ < 0
        || m_contractionEndSize_ < 0) {
        throw new IOException(CORRUPTED_DATA_ERROR_);
    }
    // the sizes of the int and char tables are converted in place from
    // bytes to element counts
    m_expansionSize_ >>= 2;
    rbc.m_expansion_ = readIntArray(m_expansionSize_);
    m_contractionIndexSize_ >>= 1;
    char contractionIndex[] = new char[m_contractionIndexSize_];
    for (int i = 0; i < contractionIndex.length; i ++) {
        contractionIndex[i] = m_dataInputStream_.readChar();
    }
    rbc.m_contractionIndex_ = contractionIndex;
    m_contractionCESize_ >>= 2;
    rbc.m_contractionCE_ = readIntArray(m_contractionCESize_);
    rbc.m_trie_ = new IntTrie(m_dataInputStream_, rbc);
    if (!rbc.m_trie_.isLatin1Linear()) {
        throw new IOException("Data corrupted, "
                              + "Collator Tries expected to have linear "
                              + "latin one data arrays");
    }
    m_expansionEndCESize_ >>= 2;
    rbc.m_expansionEndCE_ = readIntArray(m_expansionEndCESize_);
    rbc.m_expansionEndCEMaxSize_
                         = readByteArray(m_expansionEndCEMaxSizeSize_);
    rbc.m_unsafe_ = readByteArray(m_unsafeSize_);
    rbc.m_contractionEnd_ = readByteArray(m_contractionEndSize_);
}

/**
 * Reads the given number of big-endian ints from the data stream.
 * @param length number of ints to read, must not be negative
 * @return a new array holding the ints in stream order
 * @exception IOException thrown when the stream ends early or another
 *            read error occurs.
 */
private int[] readIntArray(int length) throws IOException
{
    int result[] = new int[length];
    for (int i = 0; i < length; i ++) {
        result[i] = m_dataInputStream_.readInt();
    }
    return result;
}

/**
 * Reads the given number of bytes from the data stream.
 * @param length number of bytes to read, must not be negative
 * @return a new array holding the bytes in stream order
 * @exception IOException thrown when the stream ends early or another
 *            read error occurs.
 */
private byte[] readByteArray(int length) throws IOException
{
    byte result[] = new byte[length];
    m_dataInputStream_.readFully(result);
    return result;
}
// private variables -------------------------------------------------
/**
 * Data input stream for uca.dat; wraps the stream handed to the
 * constructor.
 */
private DataInputStream m_dataInputStream_;
/**
 * File format version and id that this class understands.
 * No guarantees are made if an older version is used.
 * All three arrays below are passed to ICUBinary.readHeader by the
 * constructors.
 */
private static final byte DATA_FORMAT_VERSION_[] =
{(byte)0x2, (byte)0x0, (byte)0x0, (byte)0x0};
// the bytes spell "UCol" in ASCII
private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x43,
(byte)0x6f, (byte)0x6c};
// presumably the Unicode version (3.0.0.0) the data is expected to carry
// - TODO confirm against ICUBinary.readHeader
private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x0,
(byte)0x0, (byte)0x0};
/**
 * Error message for corrupted collation data
 */
private static final String CORRUPTED_DATA_ERROR_ =
"Data corrupted in Collation data file";
/**
 * Size of expansion table. readHeader stores it in bytes; read() converts
 * it in place to a count of ints.
 */
private int m_expansionSize_;
/**
 * Size of contraction index table. readHeader stores it in bytes; read()
 * converts it in place to a count of chars.
 */
private int m_contractionIndexSize_;
/**
 * Size of contraction table. readHeader stores it in bytes; read()
 * converts it in place to a count of ints.
 */
private int m_contractionCESize_;
/**
 * Size of the Trie in bytes, as computed by readHeader
 */
private int m_trieSize_;
/**
 * Size of the table that contains information about collation elements
 * that end with an expansion. readHeader stores it in bytes; read()
 * converts it in place to a count of ints.
 */
private int m_expansionEndCESize_;
/**
 * Size in bytes of the table that contains information about the maximum
 * size of collation elements that end with a particular expansion CE
 * corresponding to the ones in expansionEndCE
 */
private int m_expansionEndCEMaxSizeSize_;
/**
 * Size in bytes of the table that contains information about the "Unsafe"
 * codepoints
 */
private int m_unsafeSize_;
/**
 * Size in bytes of the table that contains information about codepoints
 * that end with a contraction
 */
private int m_contractionEndSize_;
/**
 * Size of the table that contains UCA contraction information.
 * Not assigned by any method of this class.
 */
private int m_UCAContractionSize_;
// private methods ---------------------------------------------------
}

File diff suppressed because it is too large Load diff