From 51632583391d675f9fd4d33092d9544e8d360ea2 Mon Sep 17 00:00:00 2001 From: Syn Wee Quek Date: Thu, 19 Sep 2002 21:19:04 +0000 Subject: [PATCH] ICU-2285 * clean up code * added iso comments * added get max names length * added get names set * shifted UCharacterName and related code to impl X-SVN-Rev: 9892 --- .../src/com/ibm/icu/impl/UCharacterName.java | 1710 +++++++++++++++++ .../{lang => impl}/UCharacterNameChoice.java | 17 +- .../{lang => impl}/UCharacterNameReader.java | 8 +- .../UCharacterUtility.java} | 67 +- icu4j/src/com/ibm/icu/lang/UCharacter.java | 81 +- .../com/ibm/icu/lang/UCharacterCategory.java | 63 +- .../src/com/ibm/icu/lang/UCharacterName.java | 1181 ------------ .../ibm/icu/lang/UCharacterNameIterator.java | 14 +- 8 files changed, 1816 insertions(+), 1325 deletions(-) create mode 100644 icu4j/src/com/ibm/icu/impl/UCharacterName.java rename icu4j/src/com/ibm/icu/{lang => impl}/UCharacterNameChoice.java (73%) mode change 100755 => 100644 rename icu4j/src/com/ibm/icu/{lang => impl}/UCharacterNameReader.java (97%) mode change 100755 => 100644 rename icu4j/src/com/ibm/icu/{lang/UCharacterUtil.java => impl/UCharacterUtility.java} (72%) mode change 100755 => 100644 delete mode 100755 icu4j/src/com/ibm/icu/lang/UCharacterName.java diff --git a/icu4j/src/com/ibm/icu/impl/UCharacterName.java b/icu4j/src/com/ibm/icu/impl/UCharacterName.java new file mode 100644 index 00000000000..c019adb1b94 --- /dev/null +++ b/icu4j/src/com/ibm/icu/impl/UCharacterName.java @@ -0,0 +1,1710 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +* $Source: +* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $ +* $Date: 2002/09/19 21:19:04 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ +package com.ibm.icu.impl; + +import java.io.InputStream; +import java.io.BufferedInputStream; +import java.io.IOException; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UCharacterCategory; + +/** +* Internal class to manage character names. +* Since data in UCharacterNameDB is stored +* in an array of char, by default indexes used in this class is refering to +* a 2 byte count, unless otherwise stated. Cases where the index is refering +* to a byte count, the index is halved and depending on whether the index is +* even or odd, the MSB or LSB of the result char at the halved index is +* returned. For indexes to an array of int, the index is multiplied by 2, +* result char at the multiplied index and its following char is returned as an +* int. 
+* UCharacter acts as a public facade for this class +* Note : 0 - 0x1F are control characters without names in Unicode 3.0 +* Information on parsing of the binary data is located at +* +* ReadMe +* @author Syn Wee Quek +* @since nov0700 +*/ + +public final class UCharacterName +{ + // public data members ---------------------------------------------- + + /** + * Number of lines per group + * 1 << GROUP_SHIFT_ + */ + public static final int LINES_PER_GROUP_ = 1 << 5; + /** + * Maximum number of groups + */ + public int m_groupcount_ = 0; + + // public methods --------------------------------------------------- + + /** + * Gets the only instance of UCharacterName + * @return only instance of UCharacterName + * @exception RuntimeException thrown when reading of name data fails + */ + public static UCharacterName getInstance() throws RuntimeException + { + if (INSTANCE_ == null) { + try { + INSTANCE_ = new UCharacterName(); + } + catch (Exception e) { + throw new RuntimeException(e.getMessage()); + } + } + return INSTANCE_; + } + + /** + * Retrieve the name of a Unicode code point. + * Depending on choice, the character name written into the + * buffer is the "modern" name or the name that was defined in Unicode + * version 1.0. + * The name contains only "invariant" characters + * like A-Z, 0-9, space, and '-'. + * + * @param ch the code point for which to get the name. + * @param choice Selector for which name to get. 
+ * @return if code point is above 0x1fff, null is returned + */ + public String getName(int ch, int choice) + { + if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || + choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) { + return null; + } + + String result = null; + + result = getAlgName(ch, choice); + + // getting normal character name + if (result == null || result.length() == 0) { + if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { + result = getExtendedName(ch); + } else { + result = getGroupName(ch, choice); + } + } + + return result; + } + + /** + * Find a character by its name and return its code point value + * @param character name + * @param choice selector to indicate if argument name is a Unicode 1.0 + * or the most current version + * @return code point + */ + public int getCharFromName(int choice, String name) + { + // checks for illegal arguments + if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT || + name == null || name.length() == 0) { + return -1; + } + + // try extended names first + int result = getExtendedChar(name.toLowerCase(), choice); + if (result >= -1) { + return result; + } + + String upperCaseName = name.toUpperCase(); + // try algorithmic names first, if fails then try group names + // int result = getAlgorithmChar(choice, uppercasename); + + if (choice != UCharacterNameChoice.UNICODE_10_CHAR_NAME) { + int count = 0; + if (m_algorithm_ != null) { + count = m_algorithm_.length; + } + for (count --; count >= 0; count --) { + result = m_algorithm_[count].getChar(upperCaseName); + if (result >= 0) { + return result; + } + } + } + + if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { + result = getGroupChar(upperCaseName, + UCharacterNameChoice.UNICODE_CHAR_NAME); + if (result == -1) { + result = getGroupChar(upperCaseName, + UCharacterNameChoice.UNICODE_10_CHAR_NAME); + } + } + else { + result = getGroupChar(upperCaseName, choice); + } + return result; + } + + // these are all UCharacterNameIterator use 
methods ------------------- + + /** + * Reads a block of compressed lengths of 32 strings and expands them into + * offsets and lengths for each string. Lengths are stored with a + * variable-width encoding in consecutive nibbles: + * If a nibble<0xc, then it is the length itself (0 = empty string). + * If a nibble>=0xc, then it forms a length value with the following + * nibble. + * The offsets and lengths arrays must be at least 33 (one more) long + * because there is no check here at the end if the last nibble is still + * used. + * @param index of group string object in array + * @param offsets array to store the value of the string offsets + * @param lengths array to store the value of the string length + * @return next index of the data string immediately after the lengths + * in terms of byte address + */ + public int getGroupLengths(int index, char offsets[], char lengths[]) + { + char length = 0xffff; + byte b = 0, + n = 0; + int shift; + index = index * m_groupsize_; // byte count offsets of group strings + int stringoffset = UCharacterUtility.toInt( + m_groupinfo_[index + OFFSET_HIGH_OFFSET_], + m_groupinfo_[index + OFFSET_LOW_OFFSET_]); + + offsets[0] = 0; + + // all 32 lengths must be read to get the offset of the first group + // string + for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) { + b = m_groupstring_[stringoffset]; + shift = 4; + + while (shift >= 0) { + // getting nibble + n = (byte)((b >> shift) & 0x0F); + if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) { + length = (char)((n - 12) << 4); + } + else { + if (length != 0xffff) { + lengths[i] = (char)((length | n) + 12); + } + else { + lengths[i] = (char)n; + } + + if (i < LINES_PER_GROUP_) { + offsets[i + 1] = (char)(offsets[i] + lengths[i]); + } + + length = 0xffff; + i ++; + } + + shift -= 4; + } + } + return stringoffset; + } + + /** + * Gets the name of the argument group index. 
+ * UnicodeData.txt uses ';' as a field separator, so no field can contain + * ';' as part of its contents. In unames.icu, it is marked as + * token[';'] == -1 only if the semicolon is used in the data file - which + * is iff we have Unicode 1.0 names or ISO comments. + * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments + * although we know that it will never be part of a name. + * Equivalent to ICU4C's expandName. + * @param index of the group name string in byte count + * @param length of the group name string + * @param choice of Unicode 1.0 name or the most current name + * @return name of the group + */ + public String getGroupName(int index, int length, int choice) + { + if (choice == UCharacterNameChoice.UNICODE_10_CHAR_NAME + || choice == UCharacterNameChoice.ISO_COMMENT_) { + if (';' >= m_tokentable_.length || m_tokentable_[';'] == 0xFFFF) { + // skip the modern name + int oldindex = index; + index += UCharacterUtility.skipByteSubString(m_groupstring_, + index, length, (byte)';'); + length -= (index - oldindex); + if (choice == UCharacterNameChoice.ISO_COMMENT_) { + // skips the 1.0 Name to the iso comment part + oldindex = index; + index += UCharacterUtility.skipByteSubString(m_groupstring_, + index, length, (byte)';'); + length -= (index - oldindex); + } + } + else { + // the semicolon byte is a token number, therefore only modern + // names are stored in unames.dat and there is no such + // requested Unicode 1.0 name here + length = 0; + } + } + + synchronized (m_utilStringBuffer_) { + m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); + byte b; + char token; + for (int i = 0; i < length;) { + b = m_groupstring_[index + i]; + i ++; + + if (b >= m_tokentable_.length) { + if (b == ';') { + break; + } + m_utilStringBuffer_.append(b); // implicit letter + } + else { + token = m_tokentable_[b & 0x00ff]; + if (token == 0xFFFE) { + // this is a lead byte for a double-byte token + token = m_tokentable_[b << 8 | + 
(m_groupstring_[index + i] & 0x00ff)]; + i ++; + } + if (token == 0xFFFF) { + if (b == ';') { + // skip the semicolon if we are seeking extended + // names and there was no 2.0 name but there + // is a 1.0 name. + if (m_utilStringBuffer_.length() == 0 && choice == + UCharacterNameChoice.EXTENDED_CHAR_NAME) { + continue; + } + break; + } + // explicit letter + m_utilStringBuffer_.append((char)(b & 0x00ff)); + } + else { // write token word + UCharacterUtility.getNullTermByteSubString( + m_utilStringBuffer_, m_tokenstring_, token); + } + } + } + + if (m_utilStringBuffer_.length() > 0) { + return m_utilStringBuffer_.toString(); + } + } + return null; + } + + /** + * Retrieves the extended name + */ + public String getExtendedName(int ch) + { + String result = getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME); + if (result == null) { + if (getType(ch) == UCharacterCategory.CONTROL) { + result = getName(ch, + UCharacterNameChoice.UNICODE_10_CHAR_NAME); + } + if (result == null) { + result = getExtendedOr10Name(ch); + } + } + return result; + } + + /** + * Gets the group index for the codepoint, or the group before it. + * @param codepoint + * @return group index containing codepoint or the group before it. 
+ */ + public int getGroup(int codepoint) + { + int endGroup = m_groupcount_; + int msb = getCodepointMSB(codepoint); + int result = 0; + // binary search for the group of names that contains the one for + // code + // find the group that contains codepoint, or the highest before it + while (result < endGroup - 1) { + int gindex = (result + endGroup) >> 1; + if (msb < getGroupMSB(gindex)) { + endGroup = gindex; + } + else { + result = gindex; + } + } + return result; + } + + /** + * Gets the extended and 1.0 name when the most current unicode names + * fail + * @param ch codepoint + * @return name of codepoint extended or 1.0 + */ + public String getExtendedOr10Name(int ch) + { + String result = null; + if (getType(ch) == UCharacterCategory.CONTROL) { + result = getName(ch, + UCharacterNameChoice.UNICODE_10_CHAR_NAME); + } + if (result == null) { + int type = getType(ch); + // Return unknown if the table of names above is not up to + // date. + if (type >= TYPE_NAMES_.length) { + result = UNKNOWN_TYPE_NAME_; + } + else { + result = TYPE_NAMES_[type]; + } + synchronized (m_utilStringBuffer_) { + m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); + m_utilStringBuffer_.append('<'); + m_utilStringBuffer_.append(result); + m_utilStringBuffer_.append('-'); + String chStr = Integer.toHexString(ch).toUpperCase(); + int zeros = 4 - chStr.length(); + while (zeros > 0) { + m_utilStringBuffer_.append('0'); + zeros --; + } + m_utilStringBuffer_.append(chStr); + m_utilStringBuffer_.append('>'); + result = m_utilStringBuffer_.toString(); + } + } + return result; + } + + /** + * Gets the MSB from the group index + * @param gindex group index + * @return the MSB of the group if gindex is valid, -1 otherwise + */ + public int getGroupMSB(int gindex) + { + if (gindex >= m_groupcount_) { + return -1; + } + return m_groupinfo_[gindex * m_groupsize_]; + } + + /** + * Gets the MSB of the codepoint + * @param codepoint + * @return the MSB of the codepoint + */ + public static 
int getCodepointMSB(int codepoint) + { + return codepoint >> GROUP_SHIFT_; + } + + /** + * Gets the maximum codepoint + 1 of the group + * @param msb most significant byte of the group + * @return limit codepoint of the group + */ + public static int getGroupLimit(int msb) + { + return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_; + } + + /** + * Gets the minimum codepoint of the group + * @param msb most significant byte of the group + * @return minimum codepoint of the group + */ + public static int getGroupMin(int msb) + { + return msb << GROUP_SHIFT_; + } + + /** + * Gets the offset to a group + * @param codepoint + * @return offset to a group + */ + public static int getGroupOffset(int codepoint) + { + return codepoint & GROUP_MASK_; + } + + /** + * Gets the minimum codepoint of a group + * @param codepoint + * @return minimum codepoint in the group which codepoint belongs to + */ + public static int getGroupMinFromCodepoint(int codepoint) + { + return codepoint & ~GROUP_MASK_; + } + + /** + * Get the Algorithm range length + * @return Algorithm range length + */ + public int getAlgorithmLength() + { + return m_algorithm_.length; + } + + /** + * Gets the start of the range + * @param index algorithm index + * @return algorithm range start + */ + public int getAlgorithmStart(int index) + { + return m_algorithm_[index].m_rangestart_; + } + + /** + * Gets the end of the range + * @param index algorithm index + * @return algorithm range end + */ + public int getAlgorithmEnd(int index) + { + return m_algorithm_[index].m_rangeend_; + } + + /** + * Gets the Algorithmic name of the codepoint + * @param index algorithmic range index + * @param codepoint + * @return algorithmic name of codepoint + */ + public String getAlgorithmName(int index, int codepoint) + { + String result = null; + synchronized (m_utilStringBuffer_) { + m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); + m_algorithm_[index].appendName(codepoint, m_utilStringBuffer_); + result = 
m_utilStringBuffer_.toString(); + } + return result; + } + + // these are transliterator use methods --------------------------------- + + /** + * Gets the maximum length of any codepoint name. + * Equivalent to uprv_getMaxCharNameLength. + * @return the maximum length of any codepoint name + */ + public int getMaxCharNameLength() + { + if (initNameSetsLengths()) { + return m_maxNameLength_; + } + else { + return 0; + } + } + + /** + * Gets the maximum length of any iso comments. + * Equivalent to uprv_getMaxISOCommentLength. + * @return the maximum length of any codepoint name + */ + public int getMaxISOCommentLength() + { + if (initNameSetsLengths()) { + return m_maxISOCommentLength_; + } + else { + return 0; + } + } + + /** + * Fills set with characters that are used in Unicode character names. + * Equivalent to uprv_getCharNameCharacters. + * @param set USet to receive characters. Existing contents are deleted. + */ + public void getCharNameCharacters(UnicodeSet set) + { + convert(m_nameSet_, set); + } + + /** + * Fills set with characters that are used in Unicode character names. + * Equivalent to uprv_getISOCommentCharacters. + * @param set USet to receive characters. Existing contents are deleted. 
+ */ + public void getISOCommentCharacters(UnicodeSet set) + { + convert(m_ISOCommentSet_, set); + } + + // package private inner class -------------------------------------- + + /** + * Algorithmic name class + */ + static final class AlgorithmName + { + // package private data members ---------------------------------- + + /** + * Constant type value of the different AlgorithmName + */ + static final int TYPE_0_ = 0; + static final int TYPE_1_ = 1; + + // package private constructors ---------------------------------- + + /** + * Constructor + */ + AlgorithmName() + { + } + + // package private methods --------------------------------------- + + /** + * Sets the information for accessing the algorithmic names + * @param rangestart starting code point that lies within this name group + * @param rangeend end code point that lies within this name group + * @param type algorithm type. There's 2 kinds of algorithmic type. First + * which uses code point as part of its name and the other uses + * variant postfix strings + * @param variant algorithmic variant + * @return true if values are valid + */ + boolean setInfo(int rangestart, int rangeend, byte type, byte variant) + { + if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend + && rangeend <= UCharacter.MAX_VALUE && + (type == TYPE_0_ || type == TYPE_1_)) { + m_rangestart_ = rangestart; + m_rangeend_ = rangeend; + m_type_ = type; + m_variant_ = variant; + return true; + } + return false; + } + + /** + * Sets the factor data + * @param array of factor + * @return true if factors are valid + */ + boolean setFactor(char factor[]) + { + if (factor.length == m_variant_) { + m_factor_ = factor; + return true; + } + return false; + } + + /** + * Sets the name prefix + * @param prefix + * @return true if prefix is set + */ + boolean setPrefix(String prefix) + { + if (prefix != null && prefix.length() > 0) { + m_prefix_ = prefix; + return true; + } + return false; + } + + /** + * Sets the variant factorized name 
data + * @param string variant factorized name data + * @return true if values are set + */ + boolean setFactorString(byte string[]) + { + // factor and variant string can be empty for things like + // hanggul code points + m_factorstring_ = string; + return true; + } + + /** + * Checks if code point lies in Algorithm object at index + * @param ch code point + */ + boolean contains(int ch) + { + return m_rangestart_ <= ch && ch <= m_rangeend_; + } + + /** + * Appends algorithm name of code point into StringBuffer. + * Note this method does not check for validity of code point in Algorithm, + * result is undefined if code point does not belong in Algorithm. + * @param ch code point + * @param str StringBuffer to append to + */ + void appendName(int ch, StringBuffer str) + { + str.append(m_prefix_); + switch (m_type_) + { + case TYPE_0_: + // prefix followed by hex digits indicating variants + Utility.hex(ch, m_variant_, str); + break; + case TYPE_1_: + // prefix followed by factorized-elements + int offset = ch - m_rangestart_; + int indexes[] = m_utilIntBuffer_; + int factor; + + // write elements according to the factors + // the factorized elements are determined by modulo + // arithmetic + synchronized (m_utilIntBuffer_) { + for (int i = m_variant_ - 1; i > 0; i --) + { + factor = m_factor_[i] & 0x00FF; + indexes[i] = offset % factor; + offset /= factor; + } + + // we don't need to calculate the last modulus because + // start <= code <= end guarantees here that + // code <= factors[0] + indexes[0] = offset; + + // joining up the factorized strings + str.append(getFactorString(indexes, m_variant_)); + } + break; + } + } + + /** + * Gets the character for the argument algorithmic name + * @return the algorithmic char or -1 otherwise. 
+ */ + int getChar(String name) + { + int prefixlen = m_prefix_.length(); + if (name.length() < prefixlen || + !m_prefix_.equals(name.substring(0, prefixlen))) { + return -1; + } + + switch (m_type_) + { + case TYPE_0_ : + try + { + int result = Integer.parseInt(name.substring(prefixlen), + 16); + // does it fit into the range? + if (m_rangestart_ <= result && result <= m_rangeend_) { + return result; + } + } + catch (NumberFormatException e) + { + return -1; + } + break; + case TYPE_1_ : + // repetitative suffix name comparison done here + // offset is the character code - start + for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++) + { + int offset = ch - m_rangestart_; + int indexes[] = m_utilIntBuffer_; + int factor; + + // write elements according to the factors + // the factorized elements are determined by modulo + // arithmetic + synchronized (m_utilIntBuffer_) { + for (int i = m_variant_ - 1; i > 0; i --) + { + factor = m_factor_[i] & 0x00FF; + indexes[i] = offset % factor; + offset /= factor; + } + + // we don't need to calculate the last modulus + // because start <= code <= end guarantees here that + // code <= factors[0] + indexes[0] = offset; + + // joining up the factorized strings + if (compareFactorString(indexes, m_variant_, name, + prefixlen)) { + return ch; + } + } + } + } + + return -1; + } + + /** + * Adds all chars in the set of algorithmic names into the set. + * Equivalent to part of calcAlgNameSetsLengths. 
+ * @param set int set to add the chars of the algorithm names into + * @param maxlength maximum length to compare to + * @return the length that is either maxlength of the length of this + * algorithm name if it is longer than maxlength + */ + int add(int set[], int maxlength) + { + // prefix length + int length = UCharacterName.add(set, m_prefix_); + switch (m_type_) { + case TYPE_0_ : { + // name = prefix + (range->variant times) hex-digits + // prefix + length += m_variant_; + /* synwee to check + * addString(set, (const char *)(range + 1)) + + range->variant;*/ + break; + } + case TYPE_1_ : { + // name = prefix factorized-elements + // get the set and maximum factor suffix length for each + // factor + for (int i = m_variant_ - 1; i > 0; i --) + { + int maxfactorlength = 0; + int count = 0; + for (int factor = m_factor_[i]; factor > 0; -- factor) { + synchronized (m_utilStringBuffer_) { + m_utilStringBuffer_.delete(0, + m_utilStringBuffer_.length()); + count + = UCharacterUtility.getNullTermByteSubString( + m_utilStringBuffer_, + m_factorstring_, count); + UCharacterName.add(set, m_utilStringBuffer_); + if (m_utilStringBuffer_.length() + > maxfactorlength) + { + maxfactorlength + = m_utilStringBuffer_.length(); + } + } + } + length += maxfactorlength; + } + } + } + if (length > maxlength) { + return length; + } + return maxlength; + } + + // private data members ------------------------------------------ + + /** + * Algorithmic data information + */ + private int m_rangestart_; + private int m_rangeend_; + private byte m_type_; + private byte m_variant_; + private char m_factor_[]; + private String m_prefix_; + private byte m_factorstring_[]; + /** + * Utility StringBuffer + */ + private StringBuffer m_utilStringBuffer_ = new StringBuffer(); + /** + * Utility int buffer + */ + private int m_utilIntBuffer_[] = new int[256]; + + // private methods ----------------------------------------------- + + /** + * Gets the indexth string in each of the argument factor 
block + * @param index array with each index corresponding to each factor block + * @param length length of the array index + * @return the combined string of the array of indexth factor string in + * factor block + */ + private String getFactorString(int index[], int length) + { + int size = m_factor_.length; + if (index == null || length != size) { + return null; + } + + synchronized (m_utilStringBuffer_) { + m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); + int count = 0; + int factor; + size --; + for (int i = 0; i <= size; i ++) { + factor = m_factor_[i]; + count = UCharacterUtility.skipNullTermByteSubString( + m_factorstring_, count, index[i]); + count = UCharacterUtility.getNullTermByteSubString( + m_utilStringBuffer_, m_factorstring_, + count); + if (i != size) { + count = UCharacterUtility.skipNullTermByteSubString( + m_factorstring_, count, + factor - index[i] - 1); + } + } + return m_utilStringBuffer_.toString(); + } + } + + /** + * Compares the indexth string in each of the argument factor block with + * the argument string + * @param index array with each index corresponding to each factor block + * @param length index array length + * @param str string to compare with + * @param offset of str to start comparison + * @return true if string matches + */ + private boolean compareFactorString(int index[], int length, String str, + int offset) + { + int size = m_factor_.length; + if (index == null || length != size) + return false; + + int count = 0; + int strcount = offset; + int factor; + size --; + for (int i = 0; i <= size; i ++) + { + factor = m_factor_[i]; + count = UCharacterUtility.skipNullTermByteSubString( + m_factorstring_, count, index[i]); + strcount = UCharacterUtility.compareNullTermByteSubString(str, + m_factorstring_, strcount, count); + if (strcount < 0) { + return false; + } + + if (i != size) { + count = UCharacterUtility.skipNullTermByteSubString( + m_factorstring_, count, factor - index[i]); + } + } + if (strcount != 
str.length()) { + return false; + } + return true; + } + } + + // package private data members -------------------------------------- + + /** + * Size of each groups + */ + int m_groupsize_ = 0; + + // package private methods -------------------------------------------- + + /** + * Sets the token data + * @param token array of tokens + * @param tokenstring array of string values of the tokens + * @return false if there is a data error + */ + boolean setToken(char token[], byte tokenstring[]) + { + if (token != null && tokenstring != null && token.length > 0 && + tokenstring.length > 0) { + m_tokentable_ = token; + m_tokenstring_ = tokenstring; + return true; + } + return false; + } + + /** + * Set the algorithm name information array + * @param algorithm information array + * @return true if the group string offset has been set correctly + */ + boolean setAlgorithm(AlgorithmName alg[]) + { + if (alg != null && alg.length != 0) { + m_algorithm_ = alg; + return true; + } + return false; + } + + /** + * Sets the number of group and size of each group in number of char + * @param count number of groups + * @param size size of group in char + * @return true if group size is set correctly + */ + boolean setGroupCountSize(int count, int size) + { + if (count <= 0 || size <= 0) { + return false; + } + m_groupcount_ = count; + m_groupsize_ = size; + return true; + } + + /** + * Sets the group name data + * @param group index information array + * @param groupstring name information array + * @return false if there is a data error + */ + boolean setGroup(char group[], byte groupstring[]) + { + if (group != null && groupstring != null && group.length > 0 && + groupstring.length > 0) { + m_groupinfo_ = group; + m_groupstring_ = groupstring; + return true; + } + return false; + } + + // private data members ---------------------------------------------- + + /** + * Data used in unames.icu + */ + private char m_tokentable_[]; + private byte m_tokenstring_[]; + private char 
m_groupinfo_[]; + private byte m_groupstring_[]; + private AlgorithmName m_algorithm_[]; + + /** + * Group use + */ + private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1]; + private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1]; + + /** + * Default name of the name datafile + */ + private static final String NAME_FILE_NAME_ = + "/com/ibm/icu/impl/data/unames.icu"; + /** + * Shift count to retrieve group information + */ + private static final int GROUP_SHIFT_ = 5; + /** + * Mask to retrieve the offset for a particular character within a group + */ + private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1; + /** + * Default buffer size of datafile + */ + private static final int NAME_BUFFER_SIZE_ = 100000; + + /** + * Position of offsethigh in group information array + */ + private static final int OFFSET_HIGH_OFFSET_ = 1; + + /** + * Position of offsetlow in group information array + */ + private static final int OFFSET_LOW_OFFSET_ = 2; + /** + * Double nibble indicator, any nibble > this number has to be combined + * with its following nibble + */ + private static final int SINGLE_NIBBLE_MAX_ = 11; + + /** + * Maximum length of character names (regular & 1.0). + */ + private static int MAX_NAME_LENGTH_ = 0; + /** + * Maximum length of ISO comments. + */ + private static int MAX_ISO_COMMENT_LENGTH_ = 0; + + /** + * Set of chars used in character names (regular & 1.0). + * Chars are platform-dependent (can be EBCDIC). + */ + private int m_nameSet_[] = new int[8]; + /** + * Set of chars used in ISO comments. (regular & 1.0). + * Chars are platform-dependent (can be EBCDIC). 
+ */ + private int m_ISOCommentSet_[] = new int[8]; + /** + * Utility StringBuffer + */ + private StringBuffer m_utilStringBuffer_ = new StringBuffer(); + /** + * Utility int buffer + */ + private int m_utilIntBuffer_[] = new int[2]; + /** + * Maximum ISO comment length + */ + private int m_maxISOCommentLength_; + /** + * Maximum name length + */ + private int m_maxNameLength_; + /** + * Singleton instance + */ + private static UCharacterName INSTANCE_ = null; + /** + * Type names used for extended names + */ + private static final String TYPE_NAMES_[] = {"unassigned", + "uppercase letter", + "lowercase letter", + "titlecase letter", + "modifier letter", + "other letter", + "non spacing mark", + "enclosing mark", + "combining spacing mark", + "decimal digit number", + "letter number", + "other number", + "space separator", + "line separator", + "paragraph separator", + "control", + "format", + "private use area", + "surrogate", + "dash punctuation", + "start punctuation", + "end punctuation", + "connector punctuation", + "other punctuation", + "math symbol", + "currency symbol", + "modifier symbol", + "other symbol", + "initial punctuation", + "final punctuation", + "noncharacter", + "lead surrogate", + "trail surrogate"}; + /** + * Unknown type name + */ + private static final String UNKNOWN_TYPE_NAME_ = "unknown"; + /** + * Not a character type + */ + private static final int NON_CHARACTER_ + = UCharacterCategory.CHAR_CATEGORY_COUNT; + /** + * Lead surrogate type + */ + private static final int LEAD_SURROGATE_ + = UCharacterCategory.CHAR_CATEGORY_COUNT + 1; + /** + * Trail surrogate type + */ + private static final int TRAIL_SURROGATE_ + = UCharacterCategory.CHAR_CATEGORY_COUNT + 2; + /** + * Extended category count + */ + static final int EXTENDED_CATEGORY_ + = UCharacterCategory.CHAR_CATEGORY_COUNT + 3; + + // private constructor ------------------------------------------------ + + /** + *

Private constructor; the only instance is created by getInstance().

+    * @exception IOException thrown when the data resource cannot be opened
+    *            or when data reading fails
+    */
+    private UCharacterName() throws IOException
+    {
+        InputStream i = getClass().getResourceAsStream(NAME_FILE_NAME_);
+        if (i == null) {
+            // getResourceAsStream reports a missing resource with null; fail
+            // with the declared exception rather than a later
+            // NullPointerException inside the reader
+            throw new IOException("Unable to open character name data "
+                                  + NAME_FILE_NAME_);
+        }
+        BufferedInputStream b = new BufferedInputStream(i,
+                                                        NAME_BUFFER_SIZE_);
+        try {
+            UCharacterNameReader reader = new UCharacterNameReader(b);
+            reader.read(this);
+        }
+        finally {
+            // closing the buffered stream closes the wrapped stream too, and
+            // doing it here releases the resource even when reading fails
+            b.close();
+        }
+    }
+
+    // private methods ---------------------------------------------------
+
+    /**
+    * Gets the algorithmic name for the argument character.
+    * The algorithm ranges are searched from the last entry to the first and
+    * the first range containing ch supplies the name.
+    * @param ch character to determine name for
+    * @param choice name choice
+    * @return the algorithmic name or null if not found; always null when
+    *         choice is UNICODE_10_CHAR_NAME
+    */
+    private String getAlgName(int ch, int choice)
+    {
+        // Do not write algorithmic Unicode 1.0 names because Unihan names are
+        // the same as the modern ones, extension A was only introduced with
+        // Unicode 3.0, and the Hangul syllable block was moved and changed
+        // around Unicode 1.1.5.
+        if (choice != UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
+            // the scratch buffer is shared, so it is locked while in use
+            synchronized (m_utilStringBuffer_) {
+                m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length());
+
+                for (int index = m_algorithm_.length - 1; index >= 0; index --)
+                {
+                    if (m_algorithm_[index].contains(ch)) {
+                        m_algorithm_[index].appendName(ch, m_utilStringBuffer_);
+                        return m_utilStringBuffer_.toString();
+                    }
+                }
+            }
+        }
+        return null;
+    }
+
+    /**
+    * Getting the character with the tokenized argument name.
+    * Every group is expanded in turn into the shared m_groupoffsets_ and
+    * m_grouplengths_ arrays and searched for the name.
+    * @param name of the character
+    * @param choice name choice
+    * @return character with the tokenized argument name or -1 if character
+    *         is not found
+    */
+    private synchronized int getGroupChar(String name, int choice)
+    {
+        for (int i = 0; i < m_groupcount_; i ++) {
+            // populating the data set of grouptable
+
+            int startgpstrindex = getGroupLengths(i, m_groupoffsets_,
+                                                  m_grouplengths_);
+
+            // shift out to function
+            int result = getGroupChar(startgpstrindex, m_grouplengths_, name,
+                                      choice);
+            if (result != -1) {
+                // group msb in the high bits, line number within the group
+                // in the low bits
+                return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_)
+                       | result;
+            }
+        }
+        return -1;
+    }
+
+    /**
+    * Compares and retrieve character if name is found within the argument
+    * group
+    * @param index index where the set of names reside in the group block
+    * @param length list of lengths of the strings
+    * @param name character name to search for
+    * @param choice of either 1.0 or the most current unicode name
+    * @return relative character in the group which matches name, otherwise if
+    *         not found, -1 will be returned
+    */
+    private int getGroupChar(int index, char length[], String name,
+                             int choice)
+    {
+        // group string bytes are unsigned values; they are widened to int so
+        // that bytes >= 0x80 compare and index correctly, the same way
+        // addGroupName reads them
+        int b = 0;
+        char token;
+        int len;
+        int namelen = name.length();
+        int nindex;
+        int count;
+
+        // NOTE(review): the bound is inclusive, so length[LINES_PER_GROUP_]
+        // is read as well - confirm against the size of the length array
+        for (int result = 0; result <= LINES_PER_GROUP_; result ++) {
+            nindex = 0;
+            len = length[result];
+
+            if (choice == UCharacterNameChoice.UNICODE_10_CHAR_NAME) {
+                // skip the modern name, the 1.0 name follows the ';'
+                int oldindex = index;
+                index += UCharacterUtility.skipByteSubString(m_groupstring_,
+                                                     index, len, (byte)';');
+                len -= (index - oldindex);
+            }
+
+            // number of tokens is > the length of the name
+            // write each letter directly, and write a token word per token
+            for (count = 0; count < len && nindex != -1 && nindex < namelen;
+                ) {
+                b = m_groupstring_[index + count] & 0xFF;
+                count ++;
+
+                if (b >= m_tokentable_.length) {
+                    // implicit letter
+                    if (name.charAt(nindex ++) != b) {
+                        nindex = -1;
+                    }
+                }
+                else {
+                    token = m_tokentable_[b];
+                    if (token == 0xFFFE) {
+                        // this is a lead byte for a double-byte token
+                        token = m_tokentable_[b << 8 |
+                                   (m_groupstring_[index + count] & 0x00ff)];
+                        count ++;
+                    }
+                    if (token == 0xFFFF) {
+                        if (name.charAt(nindex ++) != b) {
+                            nindex = -1;
+                        }
+                    }
+                    else {
+                        // compare token with name
+                        nindex = UCharacterUtility.compareNullTermByteSubString(
+                                        name, m_tokenstring_, nindex, token);
+                    }
+                }
+            }
+
+            // a match needs the whole name consumed and the stored name to
+            // end here, either at the end of the line or at a ';' separator
+            if (namelen == nindex &&
+                (count == len || m_groupstring_[index + count] == ';')) {
+                return result;
+            }
+
+            index += len;
+        }
+        return -1;
+    }
+
+    /**
+    * Binary search for the group strings set that contains the argument
Unicode
+    * code point's most significant bits.
+    * The offset of the group string set is returned only when the group
+    * found by the search has exactly the most significant bits of ch,
+    * otherwise -1 is returned.
+    * @param ch the code point to look for
+    * @return group string set index in datatable otherwise -1 is returned if
+    *         group string set is not found
+    */
+    private int getGroupStringIndex(int ch)
+    {
+        // gets the msb
+        int msb = ch >> GROUP_SHIFT_,
+            end = m_groupcount_,
+            start,
+            gindex = 0;
+
+        // binary search for the group of names that contains the one for code
+        for (start = 0; start < end - 1;) {
+            gindex = (start + end) >> 1;
+            if (msb < m_groupinfo_[gindex * m_groupsize_]) {
+                end = gindex;
+            }
+            else {
+                start = gindex;
+            }
+        }
+
+        // return this if it is an exact match
+        if (msb == m_groupinfo_[start * m_groupsize_]) {
+            start = start * m_groupsize_;
+            // the offset is stored as two chars, high part first
+            return UCharacterUtility.toInt(
+                                   m_groupinfo_[start + OFFSET_HIGH_OFFSET_],
+                                   m_groupinfo_[start + OFFSET_LOW_OFFSET_]);
+        }
+        return -1;
+    }
+
+    /**
+    * Gets the group name of the character
+    * @param ch character to get the group name
+    * @param choice name choice selector to choose a unicode 1.0 or newer name
+    * @return the name stored for ch in its group, or null when no group has
+    *         the most significant bits of ch
+    */
+    private String getGroupName(int ch, int choice)
+    {
+        // gets the msb
+        int msb = getCodepointMSB(ch);
+        int group = getGroup(ch);
+
+        // return this if it is an exact match
+        if (msb == m_groupinfo_[group * m_groupsize_]) {
+            int index = getGroupLengths(group, m_groupoffsets_,
+                                        m_grouplengths_);
+            // line of ch within its group
+            int offset = ch & GROUP_MASK_;
+            return getGroupName(index + m_groupoffsets_[offset],
+                                m_grouplengths_[offset], choice);
+        }
+
+        return null;
+    }
+
+    /**
+    * Gets the character extended type.
+    * This is UCharacter.getType(ch), except that non characters map to
+    * NON_CHARACTER_ and surrogates are split into LEAD_SURROGATE_ and
+    * TRAIL_SURROGATE_.
+    * @param ch character to be tested
+    * @return extended type it is associated with
+    */
+    private static int getType(int ch)
+    {
+        if (UCharacterUtility.isNonCharacter(ch)) {
+            // not a character we return a invalid category count
+            return NON_CHARACTER_;
+        }
+        int result = UCharacter.getType(ch);
+        if (result == UCharacterCategory.SURROGATE) {
+            if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
+                result = LEAD_SURROGATE_;
+            }
+            else {
+                result = TRAIL_SURROGATE_;
+            }
+        }
+        return result;
+    }
+
+    /**
+    * Getting the character with extended name of the form <....>.
+    * The text between '<' and the last '-' is the type name, the text
+    * between the last '-' and '>' is the code point in hexadecimal.
+    * @param name of the character to be found, must not be empty
+    * @param choice name choice
+    * @return character associated with the name, -1 if such character is not
+    *         found and -2 if we should continue with the search.
+    */
+    private static int getExtendedChar(String name, int choice)
+    {
+        if (name.charAt(0) == '<') {
+            if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) {
+                int endIndex = name.length() - 1;
+                if (name.charAt(endIndex) == '>') {
+                    int startIndex = name.lastIndexOf('-');
+                    if (startIndex >= 0) { // We've got a category.
+                        startIndex ++;
+                        int result = -1;
+                        try {
+                            result = Integer.parseInt(
+                                        name.substring(startIndex, endIndex),
+                                        16);
+                        }
+                        catch (NumberFormatException e) {
+                            return -1;
+                        }
+                        // values outside 0..0x10ffff are not code points;
+                        // without this check something like
+                        // <noncharacter-7FFFFFFE> would be accepted because
+                        // the non character test only looks at the low bits
+                        if (result < 0 || result > 0x10FFFF) {
+                            return -1;
+                        }
+                        // Now validate the category name. We could use a
+                        // binary search, or a trie, if we really wanted to.
+                        String type = name.substring(1, startIndex - 1);
+                        int length = TYPE_NAMES_.length;
+                        for (int i = 0; i < length; ++ i) {
+                            if (type.equals(TYPE_NAMES_[i])) {
+                                // the named type has to be the type of the
+                                // code point
+                                if (getType(result) == i) {
+                                    return result;
+                                }
+                                break;
+                            }
+                        }
+                    }
+                }
+            }
+            return -1;
+        }
+        return -2;
+    }
+
+    // sets of name characters, maximum name lengths -----------------------
+
+    /**
+    * Adds a codepoint into a set of ints.
+    * Each int of the set holds the flags of 32 consecutive chars.
+    * Equivalent to SET_ADD.
+    * @param set set to add to
+    * @param ch 16 bit char to add
+    */
+    private static void add(int set[], char ch)
+    {
+        set[ch >>> 5] |= 1 << (ch & 0x1f);
+    }
+
+    /**
+    * Checks if a codepoint is a part of a set of ints.
+    * Equivalent to SET_CONTAINS.
+    * @param set set to check in
+    * @param ch 16 bit char to check
+    * @return true if codepoint is part of the set, false otherwise
+    */
+    private static boolean contains(int set[], char ch)
+    {
+        // the int holding the flag of ch, then the flag inside that int
+        int word = set[ch >>> 5];
+        int flag = 1 << (ch & 0x1f);
+        return (word & flag) != 0;
+    }
+
+    /**
+    * Puts every char of the argument string into the set.
+    * Equivalent to calcStringSetLength.
+    * @param set set to add all chars of str to
+    * @param str string to add
+    * @return the length of str
+    */
+    private static int add(int set[], String str)
+    {
+        int result = str.length();
+        int i = result;
+
+        // walk from the last char to the first
+        while (i > 0) {
+            i --;
+            add(set, str.charAt(i));
+        }
+        return result;
+    }
+
+    /**
+    * Puts every char of the argument buffer into the set.
+    * Equivalent to calcStringSetLength.
+    * @param set set to add all chars of str to
+    * @param str string buffer to add
+    * @return the length of str
+    */
+    private static int add(int set[], StringBuffer str)
+    {
+        int result = str.length();
+        int i = result;
+
+        // walk from the last char to the first
+        while (i > 0) {
+            i --;
+            add(set, str.charAt(i));
+        }
+        return result;
+    }
+
+    /**
+    * Lets every algorithm range, from the last to the first, add its names
+    * to the name set and keeps the largest length reported.
+    * Equivalent to part of calcAlgNameSetsLengths.
+    * @param maxlength length to compare to
+    * @return the maximum length of any possible algorithmic name if it is >
+    *         maxlength, otherwise maxlength is returned.
+    */
+    private int addAlgorithmName(int maxlength)
+    {
+        int i = m_algorithm_.length;
+        while (-- i >= 0) {
+            maxlength = Math.max(maxlength,
+                                 m_algorithm_[i].add(m_nameSet_, maxlength));
+        }
+        return maxlength;
+    }
+
+    /**
+    * Adds all extended names into the name set.
+    * Equivalent to part of calcExtNameSetsLengths.
+    * @param maxlength length to compare to
+    * @return the maxlength of any possible extended name.
+    */
+    private int addExtendedName(int maxlength)
+    {
+        for (int i = TYPE_NAMES_.length - 1; i >= 0; i --) {
+            // for each category, count the length of the category name
+            // plus 9 =
+            // 2 for <>
+            // 1 for -
+            // 6 for most hex digits per code point
+            int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]);
+            if (length > maxlength) {
+                maxlength = length;
+            }
+        }
+        return maxlength;
+    }
+
+    /**
+    * Adds names of a group to the argument set.
+    * Parsing stops at the end of the string or after a ';' separator.
+    * Equivalent to calcNameSetLength.
+    * @param offset of the group name string in byte count
+    * @param length of the group name string
+    * @param tokenlength array to store the length of each token
+    * @param set to add to
+    * @return the length of the name string at index 0 and the length of the
+    *         group string parsed at index 1. The array returned is the
+    *         shared m_utilIntBuffer_, which the next call overwrites.
+    */
+    private int[] addGroupName(int offset, int length, byte tokenlength[],
+                               int set[])
+    {
+        int resultnlength = 0;
+        int resultplength = 0;
+        while (resultplength < length) {
+            // bytes of the group string are read as unsigned values
+            char b = (char)(m_groupstring_[offset + resultplength] & 0xff);
+            resultplength ++;
+            if (b == ';') {
+                break;
+            }
+
+            if (b >= m_tokentable_.length) {
+                add(set, b); // implicit letter
+                resultnlength ++;
+            }
+            else {
+                char token = m_tokentable_[b & 0x00ff];
+                if (token == 0xFFFE) {
+                    // this is a lead byte for a double-byte token
+                    b = (char)(b << 8 | (m_groupstring_[offset + resultplength]
+                                         & 0x00ff));
+                    token = m_tokentable_[b];
+                    resultplength ++;
+                }
+                if (token == 0xFFFF) {
+                    add(set, b);
+                    resultnlength ++;
+                }
+                else {
+                    // count token word
+                    // use cached token length
+                    byte tlength = tokenlength[b];
+                    if (tlength == 0) {
+                        // first use of this token, expand it once, add its
+                        // chars to the set and remember its length
+                        synchronized (m_utilStringBuffer_) {
+                            m_utilStringBuffer_.delete(0,
+                                                 m_utilStringBuffer_.length());
+                            UCharacterUtility.getNullTermByteSubString(
+                                           m_utilStringBuffer_, m_tokenstring_,
+                                           token);
+                            tlength = (byte)add(set, m_utilStringBuffer_);
+                        }
+                        tokenlength[b] = tlength;
+                    }
+                    resultnlength += tlength;
+                }
+            }
+        }
+        m_utilIntBuffer_[0] = resultnlength;
+        m_utilIntBuffer_[1] = resultplength;
+        return m_utilIntBuffer_;
+    }
+
+    /**
+    * Adds names of all group to the argument set.
+    * Each line of a group holds up to three ';' separated strings, the
+    * name, the Unicode 1.0 name and the ISO comment. The two names go into
+    * m_nameSet_, the ISO comment goes into m_ISOCommentSet_.
+    * Sets the data member m_max*Length_.
+    * Method called only once.
+    * Equivalent to calcGroupNameSetsLength.
+    * @param maxlength length to compare to
+    */
+    private void addGroupName(int maxlength)
+    {
+        int maxisolength = 0;
+        char offsets[] = new char[LINES_PER_GROUP_ + 2];
+        char lengths[] = new char[LINES_PER_GROUP_ + 2];
+        byte tokenlengths[] = new byte[m_tokentable_.length];
+
+        // enumerate all groups
+        for (int i = 0; i < m_groupcount_ ; i ++) {
+            int offset = getGroupLengths(i, offsets, lengths);
+            // enumerate all lines in each group
+            for (int linenumber = 0; linenumber < LINES_PER_GROUP_;
+                linenumber ++) {
+                int lineoffset = offset + offsets[linenumber];
+                int length = lengths[linenumber];
+                if (length == 0) {
+                    continue;
+                }
+
+                // read regular name
+                int parsed[] = addGroupName(lineoffset, length, tokenlengths,
+                                            m_nameSet_);
+                if (parsed[0] > maxlength) {
+                    // 0 for name length
+                    maxlength = parsed[0];
+                }
+                lineoffset += parsed[1];
+                if (parsed[1] >= length) {
+                    // 1 for parsed group string length
+                    continue;
+                }
+                length -= parsed[1];
+                // read Unicode 1.0 name
+                parsed = addGroupName(lineoffset, length, tokenlengths,
+                                      m_nameSet_);
+                if (parsed[0] > maxlength) {
+                    // 0 for name length
+                    maxlength = parsed[0];
+                }
+                lineoffset += parsed[1];
+                if (parsed[1] >= length) {
+                    // 1 for parsed group string length
+                    continue;
+                }
+                length -= parsed[1];
+                // read ISO comment
+                parsed = addGroupName(lineoffset, length, tokenlengths,
+                                      m_ISOCommentSet_);
+                // 0 for the expanded comment length, as for the names above;
+                // index 1 and length count bytes of the tokenized string,
+                // not chars of the comment
+                if (parsed[0] > maxisolength) {
+                    maxisolength = parsed[0];
+                }
+            }
+        }
+
+        // set gMax... - name length last for threading
+        m_maxISOCommentLength_ = maxisolength;
+        m_maxNameLength_ = maxlength;
+    }
+
+    /**
+    * Sets up the name sets and the calculation of the maximum lengths.
+    * Equivalent to calcNameSetsLengths.
+    * Does nothing when m_maxNameLength_ is already positive.
+    * @return always true
+    */
+    private boolean initNameSetsLengths()
+    {
+        if (m_maxNameLength_ > 0) {
+            return true;
+        }
+
+        String extra = "0123456789ABCDEF<>-";
+        // set hex digits, used in various names, and <>-, used in extended
+        // names
+        for (int i = extra.length() - 1; i >= 0; i --) {
+            add(m_nameSet_, extra.charAt(i));
+        }
+
+        // m_maxNameLength_ is the "already initialised" flag tested above,
+        // so the intermediate maxima are kept in a local and the member is
+        // only written by addGroupName(int) once all sets are filled
+        // set sets and lengths from algorithmic names
+        int maxlength = addAlgorithmName(0);
+        // set sets and lengths from extended names
+        maxlength = addExtendedName(maxlength);
+        // set sets and lengths from group names, set global maximum values
+        addGroupName(maxlength);
+        return true;
+    }
+
+    /**
+    * Converts the char set cset into a Unicode set uset.
+    * Equivalent to charSetToUSet.
+    * @param set Set of 256 bit flags corresponding to a set of chars.
+    * @param uset USet to receive characters. Existing contents are deleted.
+    */
+    private void convert(int set[], UnicodeSet uset)
+    {
+        uset.clear();
+        if (!initNameSetsLengths()) {
+            return;
+        }
+
+        // build a char string with all chars that are used in character names
+        // chars 255 down to 1 are tested, char 0 is never added
+        for (char c = 255; c > 0; c --) {
+            if (contains(set, c)) {
+                uset.add(c);
+            }
+        }
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/lang/UCharacterNameChoice.java b/icu4j/src/com/ibm/icu/impl/UCharacterNameChoice.java
old mode 100755
new mode 100644
similarity index 73%
rename from icu4j/src/com/ibm/icu/lang/UCharacterNameChoice.java
rename to icu4j/src/com/ibm/icu/impl/UCharacterNameChoice.java
index 537dd831216..5b8200f35dc
--- a/icu4j/src/com/ibm/icu/lang/UCharacterNameChoice.java
+++ b/icu4j/src/com/ibm/icu/impl/UCharacterNameChoice.java
@@ -6,13 +6,13 @@
 *
 * $Source:
 * /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterNameChoiceEnum.java $
-* $Date: 2002/02/16 03:05:57 $
-* $Revision: 1.4 $
+* $Date: 2002/09/19 21:19:04 $
+* $Revision: 1.1 $
 *
 *******************************************************************************
 */
 
-package com.ibm.icu.lang;
+package com.ibm.icu.impl;
/** * Internal class containing selector constants for the unicode character names. @@ -24,12 +24,13 @@ package com.ibm.icu.lang; * @since oct0600 */ -interface UCharacterNameChoice +public interface UCharacterNameChoice { // public variables ============================================= - static final int U_UNICODE_CHAR_NAME = 0; - static final int U_UNICODE_10_CHAR_NAME = 1; - static final int U_EXTENDED_CHAR_NAME = 2; - static final int U_CHAR_NAME_CHOICE_COUNT = 3; + static final int UNICODE_CHAR_NAME = 0; + static final int UNICODE_10_CHAR_NAME = 1; + static final int EXTENDED_CHAR_NAME = 2; + static final int CHAR_NAME_CHOICE_COUNT = 3; + static final int ISO_COMMENT_ = CHAR_NAME_CHOICE_COUNT; } diff --git a/icu4j/src/com/ibm/icu/lang/UCharacterNameReader.java b/icu4j/src/com/ibm/icu/impl/UCharacterNameReader.java old mode 100755 new mode 100644 similarity index 97% rename from icu4j/src/com/ibm/icu/lang/UCharacterNameReader.java rename to icu4j/src/com/ibm/icu/impl/UCharacterNameReader.java index 22471ba057e..320a2d20722 --- a/icu4j/src/com/ibm/icu/lang/UCharacterNameReader.java +++ b/icu4j/src/com/ibm/icu/impl/UCharacterNameReader.java @@ -4,13 +4,13 @@ * others. All Rights Reserved. 
* ******************************************************************************* * -* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/Attic/UCharacterNameReader.java,v $ -* $Date: 2002/08/01 19:50:26 $ -* $Revision: 1.11 $ +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/UCharacterNameReader.java,v $ +* $Date: 2002/09/19 21:19:04 $ +* $Revision: 1.1 $ * ******************************************************************************* */ -package com.ibm.icu.lang; +package com.ibm.icu.impl; import java.io.InputStream; import java.io.DataInputStream; diff --git a/icu4j/src/com/ibm/icu/lang/UCharacterUtil.java b/icu4j/src/com/ibm/icu/impl/UCharacterUtility.java old mode 100755 new mode 100644 similarity index 72% rename from icu4j/src/com/ibm/icu/lang/UCharacterUtil.java rename to icu4j/src/com/ibm/icu/impl/UCharacterUtility.java index 88a42ac5277..c68f4bf70c8 --- a/icu4j/src/com/ibm/icu/lang/UCharacterUtil.java +++ b/icu4j/src/com/ibm/icu/impl/UCharacterUtility.java @@ -4,13 +4,13 @@ * others. All Rights Reserved. 
* ******************************************************************************* * -* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/Attic/UCharacterUtil.java,v $ -* $Date: 2002/07/22 23:28:21 $ -* $Revision: 1.6 $ +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/UCharacterUtility.java,v $ +* $Date: 2002/09/19 21:19:04 $ +* $Revision: 1.1 $ * ******************************************************************************* */ -package com.ibm.icu.lang; +package com.ibm.icu.impl; /** * Internal character utility class for simple data type conversion and String @@ -19,18 +19,26 @@ package com.ibm.icu.lang; * @since sep2900 */ -final class UCharacterUtil +public final class UCharacterUtility { - // constructor ===================================================== - + // public methods ----------------------------------------------------- + /** - * private constructor to avoid initialisation + * Determines if codepoint is a non character + * @param ch codepoint + * @return true if codepoint is a non character false otherwise */ - private UCharacterUtil() + public static boolean isNonCharacter(int ch) { + if ((ch & NON_CHARACTER_SUFFIX_MIN_3_0_) == + NON_CHARACTER_SUFFIX_MIN_3_0_) { + return true; + } + + return ch >= NON_CHARACTER_MIN_3_1_ && ch <= NON_CHARACTER_MAX_3_1_; } - - // protected methods =============================================== + + // package private methods --------------------------------------------- /** * joining 2 chars to form an int @@ -38,7 +46,7 @@ final class UCharacterUtil * @param lsc least significant char * @return int form */ - protected static int toInt(char msc, char lsc) + static int toInt(char msc, char lsc) { return ((msc << 16) | lsc); } @@ -49,7 +57,7 @@ final class UCharacterUtil * @param lsb the least significant byte * @return char form */ - protected static char toChar(byte msb, byte lsb) + static char toChar(byte msb, byte lsb) { return (char)((msb << 8) | (lsb & 0xFF)); } @@ -65,7 +73,7 @@ final class UCharacterUtil 
* @param index to start substring in byte count * @return the end position of the substring within the character array */ - protected static int getNullTermByteSubString(StringBuffer str, byte[] array, + static int getNullTermByteSubString(StringBuffer str, byte[] array, int index) { byte b = 1; @@ -93,7 +101,7 @@ final class UCharacterUtil * @return the end position of the substring within str if matches otherwise * a -1 */ - protected static int compareNullTermByteSubString(String str, byte[] array, + static int compareNullTermByteSubString(String str, byte[] array, int strindex, int aindex) { byte b = 1; @@ -127,7 +135,7 @@ final class UCharacterUtil * @param skipcount number of null terminated substrings to skip * @return the end position of the substrings within the character array */ - protected static int skipNullTermByteSubString(byte[] array, int index, + static int skipNullTermByteSubString(byte[] array, int index, int skipcount) { byte b; @@ -154,7 +162,7 @@ final class UCharacterUtil * @param skipend value of byte to skip to * @return the number of bytes skipped */ - protected static int skipByteSubString(byte[] array, int index, int length, + static int skipByteSubString(byte[] array, int index, int length, byte skipend) { int result; @@ -172,5 +180,30 @@ final class UCharacterUtil return result; } + + // private data member -------------------------------------------------- + + /** + * Minimum suffix value that indicates if a character is non character. 
+ * Unicode 3.0 non characters + */ + private static final int NON_CHARACTER_SUFFIX_MIN_3_0_ = 0xFFFE; + /** + * New minimum non character in Unicode 3.1 + */ + private static final int NON_CHARACTER_MIN_3_1_ = 0xFDD0; + /** + * New non character range in Unicode 3.1 + */ + private static final int NON_CHARACTER_MAX_3_1_ = 0xFDEF; + + // private constructor -------------------------------------------------- + + /** + * private constructor to avoid initialisation + */ + private UCharacterUtility() + { + } } diff --git a/icu4j/src/com/ibm/icu/lang/UCharacter.java b/icu4j/src/com/ibm/icu/lang/UCharacter.java index 95fe2504a6c..11019ca79a1 100755 --- a/icu4j/src/com/ibm/icu/lang/UCharacter.java +++ b/icu4j/src/com/ibm/icu/lang/UCharacter.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $ -* $Date: 2002/09/11 00:12:39 $ -* $Revision: 1.46 $ +* $Date: 2002/09/19 21:18:14 $ +* $Revision: 1.47 $ * ******************************************************************************* */ @@ -21,6 +21,9 @@ import com.ibm.icu.util.VersionInfo; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.impl.NormalizerImpl; +import com.ibm.icu.impl.UCharacterUtility; +import com.ibm.icu.impl.UCharacterName; +import com.ibm.icu.impl.UCharacterNameChoice; /** *

@@ -842,7 +845,7 @@ public final class UCharacter if (ch <= UTF16.SURROGATE_MAX_VALUE) { return false; } - if (isNonCharacter(ch)) { + if (UCharacterUtility.isNonCharacter(ch)) { return false; } return (ch <= MAX_VALUE); @@ -898,7 +901,7 @@ public final class UCharacter */ public static String getName(int ch) { - return NAME_.getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME); + return NAME_.getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME); } /** @@ -914,7 +917,7 @@ public final class UCharacter public static String getName1_0(int ch) { return NAME_.getName(ch, - UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); + UCharacterNameChoice.UNICODE_10_CHAR_NAME); } /** @@ -937,7 +940,22 @@ public final class UCharacter */ public static String getExtendedName(int ch) { - return NAME_.getName(ch, UCharacterNameChoice.U_EXTENDED_CHAR_NAME); + return NAME_.getName(ch, UCharacterNameChoice.EXTENDED_CHAR_NAME); + } + + /** + * Get the ISO 10646 comment for a character. + * The ISO 10646 comment is an informative field in the Unicode Character + * Database (UnicodeData.txt field 11) and is from the ISO 10646 names list. + * @param ch The code point for which to get the ISO comment. + * It must be 0<=c<=0x10ffff. + * @return The ISO comment, or null if there is no comment for this + * character. 
+ * @draft ICU 2.4 + */ + public static String getISOComment(int ch) + { + return NAME_.getName(ch, UCharacterNameChoice.ISO_COMMENT_); } /** @@ -952,7 +970,7 @@ public final class UCharacter public static int getCharFromName(String name) { return NAME_.getCharFromName( - UCharacterNameChoice.U_UNICODE_CHAR_NAME, name); + UCharacterNameChoice.UNICODE_CHAR_NAME, name); } /** @@ -967,7 +985,7 @@ public final class UCharacter public static int getCharFromName1_0(String name) { return NAME_.getCharFromName( - UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name); + UCharacterNameChoice.UNICODE_10_CHAR_NAME, name); } /** @@ -992,7 +1010,7 @@ public final class UCharacter public static int getCharFromExtendedName(String name) { return NAME_.getCharFromName( - UCharacterNameChoice.U_EXTENDED_CHAR_NAME, name); + UCharacterNameChoice.EXTENDED_CHAR_NAME, name); } /** @@ -1462,7 +1480,7 @@ public final class UCharacter public static ValueIterator getNameIterator() { return new UCharacterNameIterator(NAME_, - UCharacterNameChoice.U_UNICODE_CHAR_NAME); + UCharacterNameChoice.UNICODE_CHAR_NAME); } /** @@ -1487,7 +1505,7 @@ public final class UCharacter public static ValueIterator getName1_0Iterator() { return new UCharacterNameIterator(NAME_, - UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); + UCharacterNameChoice.UNICODE_10_CHAR_NAME); } /** @@ -1512,7 +1530,7 @@ public final class UCharacter public static ValueIterator getExtendedNameIterator() { return new UCharacterNameIterator(NAME_, - UCharacterNameChoice.U_EXTENDED_CHAR_NAME); + UCharacterNameChoice.EXTENDED_CHAR_NAME); } /** @@ -1616,7 +1634,7 @@ public final class UCharacter { return hasBinaryProperty(ch, UProperty.WHITE_SPACE); } - + // protected data members -------------------------------------------- /** @@ -1629,30 +1647,13 @@ public final class UCharacter { try { - NAME_ = new UCharacterName(); + NAME_ = UCharacterName.getInstance(); } catch (Exception e) { throw new RuntimeException(e.getMessage()); } } - - // 
protected methods ------------------------------------------------- - - /** - * Determines if codepoint is a non character - * @param ch codepoint - * @return true if codepoint is a non character false otherwise - */ - static boolean isNonCharacter(int ch) - { - if ((ch & NON_CHARACTER_SUFFIX_MIN_3_0_) == - NON_CHARACTER_SUFFIX_MIN_3_0_) { - return true; - } - - return ch >= NON_CHARACTER_MIN_3_1_ && ch <= NON_CHARACTER_MAX_3_1_; - } // private variables ------------------------------------------------- @@ -1692,24 +1693,8 @@ public final class UCharacter /** * Shift 24 bits */ - private static final int SHIFT_24_ = 24; - - /** - * Minimum suffix value that indicates if a character is non character. - * Unicode 3.0 non characters - */ - private static final int NON_CHARACTER_SUFFIX_MIN_3_0_ = 0xFFFE; + private static final int SHIFT_24_ = 24; - /** - * New minimum non character in Unicode 3.1 - */ - private static final int NON_CHARACTER_MIN_3_1_ = 0xFDD0; - - /** - * New non character range in Unicode 3.1 - */ - private static final int NON_CHARACTER_MAX_3_1_ = 0xFDEF; - /** * Decimal radix */ diff --git a/icu4j/src/com/ibm/icu/lang/UCharacterCategory.java b/icu4j/src/com/ibm/icu/lang/UCharacterCategory.java index 9c5ff60bad7..4d86d4e232d 100755 --- a/icu4j/src/com/ibm/icu/lang/UCharacterCategory.java +++ b/icu4j/src/com/ibm/icu/lang/UCharacterCategory.java @@ -6,8 +6,8 @@ * * $Source: * /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterCategory.java $ -* $Date: 2002/09/11 00:12:39 $ -* $Revision: 1.8 $ +* $Date: 2002/09/19 21:18:14 $ +* $Revision: 1.9 $ * ******************************************************************************* */ @@ -250,63 +250,4 @@ public final class UCharacterCategory private UCharacterCategory() { } - - // package private data members -------------------------------------- - - /** - * Not a character type - */ - static final int NON_CHARACTER_ = CHAR_CATEGORY_COUNT; - /** - * Lead surrogate type - */ - static final int 
LEAD_SURROGATE_ = CHAR_CATEGORY_COUNT + 1; - /** - * Trail surrogate type - */ - static final int TRAIL_SURROGATE_ = CHAR_CATEGORY_COUNT + 2; - /** - * Extended category count - */ - static final int EXTENDED_CATEGORY_ = CHAR_CATEGORY_COUNT + 3; - /** - * Type names used for extended names - */ - static final String TYPE_NAMES_[] = {"unassigned", - "uppercase letter", - "lowercase letter", - "titlecase letter", - "modifier letter", - "other letter", - "non spacing mark", - "enclosing mark", - "combining spacing mark", - "decimal digit number", - "letter number", - "other number", - "space separator", - "line separator", - "paragraph separator", - "control", - "format", - "private use area", - "surrogate", - "dash punctuation", - "start punctuation", - "end punctuation", - "connector punctuation", - "other punctuation", - "math symbol", - "currency symbol", - "modifier symbol", - "other symbol", - "initial punctuation", - "final punctuation", - "noncharacter", - "lead surrogate", - "trail surrogate"}; - /** - * Unknown type name - */ - static final String UNKNOWN_TYPE_NAME_ = "unknown"; } diff --git a/icu4j/src/com/ibm/icu/lang/UCharacterName.java b/icu4j/src/com/ibm/icu/lang/UCharacterName.java deleted file mode 100755 index 208c624a433..00000000000 --- a/icu4j/src/com/ibm/icu/lang/UCharacterName.java +++ /dev/null @@ -1,1181 +0,0 @@ -/** -******************************************************************************* -* Copyright (C) 1996-2001, International Business Machines Corporation and * -* others. All Rights Reserved. 
* -******************************************************************************* -* -* $Source: -* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $ -* $Date: 2002/07/30 02:38:11 $ -* $Revision: 1.17 $ -* -******************************************************************************* -*/ -package com.ibm.icu.lang; - -import java.io.InputStream; -import java.io.BufferedInputStream; -import java.io.IOException; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.text.UTF16; - -/** -* Internal class to manage character names. -* Since data in UCharacterNameDB is stored -* in an array of char, by default indexes used in this class is refering to -* a 2 byte count, unless otherwise stated. Cases where the index is refering -* to a byte count, the index is halved and depending on whether the index is -* even or odd, the MSB or LSB of the result char at the halved index is -* returned. For indexes to an array of int, the index is multiplied by 2, -* result char at the multiplied index and its following char is returned as an -* int. 
-* UCharacter acts as a public facade for this class -* Note : 0 - 0x1F are control characters without names in Unicode 3.0 -* Information on parsing of the binary data is located at -* -* ReadMe -* @author Syn Wee Quek -* @since nov0700 -*/ - -final class UCharacterName -{ - // public methods ---------------------------------------------------- - - /** - * toString method for printing - */ - public String toString() - { - StringBuffer result = new StringBuffer("names content \n"); - /*result.append(super.toString()); - result.append('\n'); - result.append("token string offset "); - result.append(m_tokenstringoffset_); - result.append("\n"); - result.append("group offset "); - result.append(m_groupsoffset_); - result.append("\n"); - result.append("group string offset "); - result.append(m_groupstringoffset_); - result.append("\n"); - result.append("alg names offset "); - result.append(m_algnamesoffset_); - result.append("\n"); - */ - return result.toString(); - } - - // package protected inner class ------------------------------------- - - /** - * Algorithmic name class - */ - static final class AlgorithmName - { - // protected data members ---------------------------------------- - - /** - * Constant type value of the different AlgorithmName - */ - protected static final int TYPE_0_ = 0; - protected static final int TYPE_1_ = 1; - - // protected constructors ---------------------------------------- - - /** - * Constructor - */ - protected AlgorithmName() - { - } - - // protected methods --------------------------------------------- - - /** - * Sets the information for accessing the algorithmic names - * @param rangestart starting code point that lies within this name group - * @param rangeend end code point that lies within this name group - * @param type algorithm type. There's 2 kinds of algorithmic type. 
First - * which uses code point as part of its name and the other uses - * variant postfix strings - * @param variant algorithmic variant - * @return true if values are valid - */ - protected boolean setInfo(int rangestart, int rangeend, byte type, - byte variant) - { - if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend - && rangeend <= UCharacter.MAX_VALUE && - (type == TYPE_0_ || type == TYPE_1_)) { - m_rangestart_ = rangestart; - m_rangeend_ = rangeend; - m_type_ = type; - m_variant_ = variant; - return true; - } - return false; - } - - /** - * Sets the factor data - * @param array of factor - * @return true if factors are valid - */ - protected boolean setFactor(char factor[]) - { - if (factor.length == m_variant_) { - m_factor_ = factor; - return true; - } - return false; - } - - /** - * Sets the name prefix - * @param prefix - * @return true if prefix is set - */ - protected boolean setPrefix(String prefix) - { - if (prefix != null && prefix.length() > 0) { - m_prefix_ = prefix; - return true; - } - return false; - } - - /** - * Sets the variant factorized name data - * @param string variant factorized name data - * @return true if values are set - */ - protected boolean setFactorString(byte string[]) - { - // factor and variant string can be empty for things like - // hanggul code points - m_factorstring_ = string; - return true; - } - - /** - * Checks if code point lies in Algorithm object at index - * @param ch code point - */ - protected boolean contains(int ch) - { - return m_rangestart_ <= ch && ch <= m_rangeend_; - } - - /** - * Appends algorithm name of code point into StringBuffer. - * Note this method does not check for validity of code point in Algorithm, - * result is undefined if code point does not belong in Algorithm. 
- * @param ch code point - * @param str StringBuffer to append to - */ - protected void appendName(int ch, StringBuffer str) - { - str.append(m_prefix_); - switch (m_type_) - { - case TYPE_0_: - // prefix followed by hex digits indicating variants - Utility.hex(ch, m_variant_, str); - break; - case TYPE_1_: - // prefix followed by factorized-elements - int offset = ch - m_rangestart_; - int indexes[] = new int[m_variant_]; - int factor; - - // write elements according to the factors - // the factorized elements are determined by modulo - // arithmetic - for (int i = m_variant_ - 1; i > 0; i --) - { - factor = m_factor_[i] & 0x00FF; - indexes[i] = offset % factor; - offset /= factor; - } - - // we don't need to calculate the last modulus because - // start <= code <= end guarantees here that - // code <= factors[0] - indexes[0] = offset; - - // joining up the factorized strings - String s[] = getFactorString(indexes); - if (s != null && s.length > 0) - { - int size = s.length; - for (int i = 0; i < size; i ++) - str.append(s[i]); - } - break; - } - } - - /** - * Gets the character for the argument algorithmic name - * @return the algorithmic char or -1 otherwise. - */ - protected int getAlgorithmChar(String name) - { - int prefixlen = m_prefix_.length(); - if (name.length() < prefixlen || - !m_prefix_.equals(name.substring(0, prefixlen))) { - return -1; - } - - switch (m_type_) - { - case TYPE_0_ : - try - { - int result = Integer.parseInt(name.substring(prefixlen), - 16); - // does it fit into the range? 
- if (m_rangestart_ <= result && result <= m_rangeend_) { - return result; - } - } - catch (NumberFormatException e) - { - return -1; - } - break; - case TYPE_1_ : - // repetitative suffix name comparison done here - // offset is the character code - start - for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++) - { - int offset = ch - m_rangestart_; - int indexes[] = new int[m_variant_]; - int factor; - - // write elements according to the factors - // the factorized elements are determined by modulo - // arithmetic - for (int i = m_variant_ - 1; i > 0; i --) - { - factor = m_factor_[i] & 0x00FF; - indexes[i] = offset % factor; - offset /= factor; - } - - // we don't need to calculate the last modulus - // because start <= code <= end guarantees here that - // code <= factors[0] - indexes[0] = offset; - - // joining up the factorized strings - if (compareFactorString(indexes, name, prefixlen)) { - return ch; - } - } - } - - return -1; - } - - // private data members ------------------------------------------ - - /** - * Algorithmic data information - */ - private int m_rangestart_; - private int m_rangeend_; - private byte m_type_; - private byte m_variant_; - private char m_factor_[]; - private String m_prefix_; - private byte m_factorstring_[]; - - // private methods ----------------------------------------------- - - /** - * Gets the indexth string in each of the argument factor block - * @param index array with each index corresponding to each factor block - * @return array of indexth factor string in factor block - */ - private String[] getFactorString(int index[]) - { - int size = m_factor_.length; - if (index == null || index.length != size) { - return null; - } - - String result[] = new String[size]; - StringBuffer str = new StringBuffer(); - int count = 0; - int factor; - size --; - for (int i = 0; i <= size; i ++) { - factor = m_factor_[i]; - count = UCharacterUtil.skipNullTermByteSubString( - m_factorstring_, count, index[i]); - count = 
UCharacterUtil.getNullTermByteSubString( - str, m_factorstring_, count); - if (i != size) { - count = UCharacterUtil.skipNullTermByteSubString( - m_factorstring_, count, - factor - index[i] - 1); - } - result[i] = str.toString(); - str.delete(0, str.length()); - } - return result; - } - - /** - * Compares the indexth string in each of the argument factor block with - * the argument string - * @param index array with each index corresponding to each factor block - * @param str string to compare with - * @param offset of str to start comparison - * @return true if string matches - */ - private boolean compareFactorString(int index[], String str, - int offset) - { - int size = m_factor_.length; - if (index == null || index.length != size) - return false; - - int count = 0; - int strcount = offset; - int factor; - size --; - for (int i = 0; i <= size; i ++) - { - factor = m_factor_[i]; - count = UCharacterUtil.skipNullTermByteSubString( - m_factorstring_, count, index[i]); - strcount = UCharacterUtil.compareNullTermByteSubString(str, - m_factorstring_, strcount, count); - if (strcount < 0) { - return false; - } - - if (i != size) { - count = UCharacterUtil.skipNullTermByteSubString( - m_factorstring_, count, factor - index[i]); - } - } - if (strcount != str.length()) { - return false; - } - return true; - } - } - - // protected data members -------------------------------------------- - - /** - * Maximum number of groups - */ - protected int m_groupcount_ = 0; - /** - * Size of each groups - */ - protected int m_groupsize_ = 0; - /** - * Number of lines per group - * 1 << GROUP_SHIFT_ - */ - protected static final int LINES_PER_GROUP_ = 1 << 5; - - // protected constructor --------------------------------------------- - - /** - *

Protected constructor for use in UCharacter.

- * @exception IOException thrown when data reading fails - */ - protected UCharacterName() throws IOException - { - InputStream i = getClass().getResourceAsStream(NAME_FILE_NAME_); - BufferedInputStream b = new BufferedInputStream(i, - NAME_BUFFER_SIZE_); - UCharacterNameReader reader = new UCharacterNameReader(b); - reader.read(this); - i.close(); - } - - // protected methods ------------------------------------------------- - - /** - * Retrieve the name of a Unicode code point. - * Depending on choice, the character name written into the - * buffer is the "modern" name or the name that was defined in Unicode - * version 1.0. - * The name contains only "invariant" characters - * like A-Z, 0-9, space, and '-'. - * - * @param ch the code point for which to get the name. - * @param choice Selector for which name to get. - * @return if code point is above 0x1fff, null is returned - */ - protected String getName(int ch, int choice) - { - if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || - choice >= UCharacterNameChoice.U_CHAR_NAME_CHOICE_COUNT) { - return null; - } - - String result = null; - - result = getAlgName(ch, choice); - - // getting normal character name - if (result == null || result.length() == 0) { - if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { - result = getExtendedName(ch); - } else { - result = getGroupName(ch, choice); - } - } - - return result; - } - - /** - * Find a character by its name and return its code point value - * @param character name - * @param choice selector to indicate if argument name is a Unicode 1.0 - * or the most current version - * @return code point - */ - protected int getCharFromName(int choice, String name) - { - // checks for illegal arguments - if (choice >= UCharacterNameChoice.U_CHAR_NAME_CHOICE_COUNT || - name == null || name.length() == 0) { - return -1; - } - - // try extended names first - int result = getExtendedChar(name.toLowerCase(), choice); - if (result >= -1) { - return result; - } - - 
String upperCaseName = name.toUpperCase(); - // try algorithmic names first, if fails then try group names - // int result = getAlgorithmChar(choice, uppercasename); - - if (choice != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { - int count = 0; - if (m_algorithm_ != null) { - count = m_algorithm_.length; - } - for (count --; count >= 0; count --) { - result = m_algorithm_[count].getAlgorithmChar(upperCaseName); - if (result >= 0) { - return result; - } - } - } - - if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { - result = getGroupChar(upperCaseName, - UCharacterNameChoice.U_UNICODE_CHAR_NAME); - if (result == -1) { - result = getGroupChar(upperCaseName, - UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); - } - } - else { - result = getGroupChar(upperCaseName, choice); - } - return result; - } - - /** - * Sets the token data - * @param token array of tokens - * @param tokenstring array of string values of the tokens - * @return false if there is a data error - */ - protected boolean setToken(char token[], byte tokenstring[]) - { - if (token != null && tokenstring != null && token.length > 0 && - tokenstring.length > 0) { - m_tokentable_ = token; - m_tokenstring_ = tokenstring; - return true; - } - return false; - } - - /** - * Set the algorithm name information array - * @param algorithm information array - * @return true if the group string offset has been set correctly - */ - protected boolean setAlgorithm(AlgorithmName alg[]) - { - if (alg != null && alg.length != 0) { - m_algorithm_ = alg; - return true; - } - return false; - } - - /** - * Sets the number of group and size of each group in number of char - * @param count number of groups - * @param size size of group in char - * @return true if group size is set correctly - */ - protected boolean setGroupCountSize(int count, int size) - { - if (count <= 0 || size <= 0) { - return false; - } - m_groupcount_ = count; - m_groupsize_ = size; - return true; - } - - /** - * Sets the group name data - * 
@param group index information array - * @param groupstring name information array - * @return false if there is a data error - */ - protected boolean setGroup(char group[], byte groupstring[]) - { - if (group != null && groupstring != null && group.length > 0 && - groupstring.length > 0) { - m_groupinfo_ = group; - m_groupstring_ = groupstring; - return true; - } - return false; - } - - /** - * Reads a block of compressed lengths of 32 strings and expands them into - * offsets and lengths for each string. Lengths are stored with a - * variable-width encoding in consecutive nibbles: - * If a nibble<0xc, then it is the length itself (0 = empty string). - * If a nibble>=0xc, then it forms a length value with the following - * nibble. - * The offsets and lengths arrays must be at least 33 (one more) long - * because there is no check here at the end if the last nibble is still - * used. - * @param index of group string object in array - * @param offsets array to store the value of the string offsets - * @param lengths array to store the value of the string length - * @return next index of the data string immediately after the lengths - * in terms of byte address - */ - protected int getGroupLengths(int index, char offsets[], char lengths[]) - { - char length = 0xffff; - byte b = 0, - n = 0; - int shift; - index = index * m_groupsize_; // byte count offsets of group strings - int stringoffset = UCharacterUtil.toInt( - m_groupinfo_[index + OFFSET_HIGH_OFFSET_], - m_groupinfo_[index + OFFSET_LOW_OFFSET_]); - - offsets[0] = 0; - - // all 32 lengths must be read to get the offset of the first group - // string - for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) { - b = m_groupstring_[stringoffset]; - shift = 4; - - while (shift >= 0) { - // getting nibble - n = (byte)((b >> shift) & 0x0F); - if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) { - length = (char)((n - 12) << 4); - } - else { - if (length != 0xffff) { - lengths[i] = (char)((length | n) + 12); - } - else { - 
lengths[i] = (char)n; - } - - if (i < LINES_PER_GROUP_) { - offsets[i + 1] = (char)(offsets[i] + lengths[i]); - } - - length = 0xffff; - i ++; - } - - shift -= 4; - } - } - return stringoffset; - } - - /** - * Gets the name of the argument group index - * @param index of the group name string in byte count - * @param length of the group name string - * @param choice of Unicode 1.0 name or the most current name - * @return name of the group - */ - protected String getGroupName(int index, int length, int choice) - { - if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { - int oldindex = index; - index += UCharacterUtil.skipByteSubString(m_groupstring_, - index, length, (byte)';'); - length -= (index - oldindex); - } - - StringBuffer s = new StringBuffer(); - byte b; - char token; - for (int i = 0; i < length;) { - b = m_groupstring_[index + i]; - i ++; - - if (b >= m_tokentable_.length) { - if (b == ';') { - break; - } - s.append(b); // implicit letter - } - else { - token = m_tokentable_[b & 0x00ff]; - if (token == 0xFFFE) { - // this is a lead byte for a double-byte token - token = m_tokentable_[b << 8 | - (m_groupstring_[index + i] & 0x00ff)]; - i ++; - } - if (token == 0xFFFF) { - if (b == ';') { - // skip the semicolon if we are seeking extended - // names and there was no 2.0 name but there - // is a 1.0 name. 
- if (s.length() == 0 && choice == - UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { - continue; - } - break; - } - s.append((char)(b & 0x00ff)); // explicit letter - } - else { // write token word - UCharacterUtil.getNullTermByteSubString(s, - m_tokenstring_, token); - } - } - } - - if (s.length() == 0) { - return null; - } - return s.toString(); - } - - /** - * Retrieves the extended name - */ - protected String getExtendedName(int ch) - { - String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME); - if (result == null) { - if (getType(ch) == UCharacterCategory.CONTROL) { - result = getName(ch, - UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); - } - if (result == null) { - result = getExtendedOr10Name(ch); - } - } - return result; - } - - /** - * Gets the group index for the codepoint, or the group before it. - * @param codepoint - * @return group index containing codepoint or the group before it. - */ - protected int getGroup(int codepoint) - { - int endGroup = m_groupcount_; - int msb = getCodepointMSB(codepoint); - int result = 0; - // binary search for the group of names that contains the one for - // code - // find the group that contains codepoint, or the highest before it - while (result < endGroup - 1) { - int gindex = (result + endGroup) >> 1; - if (msb < getGroupMSB(gindex)) { - endGroup = gindex; - } - else { - result = gindex; - } - } - return result; - } - - /** - * Gets the extended and 1.0 name when the most current unicode names - * fail - * @param ch codepoint - * @return name of codepoint extended or 1.0 - */ - protected String getExtendedOr10Name(int ch) - { - String result = null; - if (getType(ch) == UCharacterCategory.CONTROL) { - result = getName(ch, - UCharacterNameChoice.U_UNICODE_10_CHAR_NAME); - } - if (result == null) { - int type = getType(ch); - // Return unknown if the table of names above is not up to - // date. 
- if (type >= UCharacterCategory.TYPE_NAMES_.length) { - result = UCharacterCategory.UNKNOWN_TYPE_NAME_; - } - else { - result = UCharacterCategory.TYPE_NAMES_[type]; - } - StringBuffer tempResult = new StringBuffer(result); - tempResult.insert(0, '<'); - tempResult.append('-'); - String chStr = Integer.toHexString(ch).toUpperCase(); - int zeros = 4 - chStr.length(); - while (zeros > 0) { - tempResult.append('0'); - zeros --; - } - tempResult.append(chStr); - tempResult.append('>'); - result = tempResult.toString(); - } - return result; - } - - // these are all UCharacterNameIterator use methods ------------------- - - /** - * Gets the MSB from the group index - * @param gindex group index - * @return the MSB of the group if gindex is valid, -1 otherwise - */ - protected int getGroupMSB(int gindex) - { - if (gindex >= m_groupcount_) { - return -1; - } - return m_groupinfo_[gindex * m_groupsize_]; - } - - /** - * Gets the MSB of the codepoint - * @param codepoint - * @return the MSB of the codepoint - */ - protected int getCodepointMSB(int codepoint) - { - return codepoint >> GROUP_SHIFT_; - } - - /** - * Gets the maximum codepoint + 1 of the group - * @param msb most significant byte of the group - * @return limit codepoint of the group - */ - protected int getGroupLimit(int msb) - { - return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_; - } - - /** - * Gets the minimum codepoint of the group - * @param msb most significant byte of the group - * @return minimum codepoint of the group - */ - protected int getGroupMin(int msb) - { - return msb << GROUP_SHIFT_; - } - - /** - * Gets the offset to a group - * @param codepoint - * @return offset to a group - */ - protected int getGroupOffset(int codepoint) - { - return codepoint & GROUP_MASK_; - } - - /** - * Gets the minimum codepoint of a group - * @param codepoint - * @return minimum codepoint in the group which codepoint belongs to - */ - protected int getGroupMinFromCodepoint(int codepoint) - { - return codepoint & 
~GROUP_MASK_; - } - - /** - * Get the Algorithm range length - * @return Algorithm range length - */ - protected int getAlgorithmLength() - { - return m_algorithm_.length; - } - - /** - * Gets the start of the range - * @param index algorithm index - * @return algorithm range start - */ - protected int getAlgorithmStart(int index) - { - return m_algorithm_[index].m_rangestart_; - } - - /** - * Gets the end of the range - * @param index algorithm index - * @return algorithm range end - */ - protected int getAlgorithmEnd(int index) - { - return m_algorithm_[index].m_rangeend_; - } - - /** - * Gets the Algorithmic name of the codepoint - * @param index algorithmic range index - * @param codepoint - * @return algorithmic name of codepoint - */ - protected String getAlgorithmName(int index, int codepoint) - { - StringBuffer result = new StringBuffer(); - m_algorithm_[index].appendName(codepoint, result); - return result.toString(); - } - - - // private data members ---------------------------------------------- - - /** - * Data used in unames.icu - */ - private char m_tokentable_[]; - private byte m_tokenstring_[]; - private char m_groupinfo_[]; - private byte m_groupstring_[]; - private AlgorithmName m_algorithm_[]; - - /** - * Group use - */ - private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1]; - private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1]; - - /** - * Default name of the name datafile - */ - private static final String NAME_FILE_NAME_ = - "/com/ibm/icu/impl/data/unames.icu"; - /** - * Shift count to retrieve group information - */ - private static final int GROUP_SHIFT_ = 5; - /** - * Mask to retrieve the offset for a particular character within a group - */ - private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1; - /** - * Default buffer size of datafile - */ - private static final int NAME_BUFFER_SIZE_ = 100000; - - /** - * Position of offsethigh in group information array - */ - private static final int OFFSET_HIGH_OFFSET_ = 
1; - - /** - * Position of offsetlow in group information array - */ - private static final int OFFSET_LOW_OFFSET_ = 2; - /** - * Double nibble indicator, any nibble > this number has to be combined - * with its following nibble - */ - private static final int SINGLE_NIBBLE_MAX_ = 11; - - - // private methods --------------------------------------------------- - - /** - * Gets the algorithmic name for the argument character - * @param ch character to determine name for - * @param choice name choice - * @return the algorithmic name or null if not found - */ - private String getAlgName(int ch, int choice) - { - // Do not write algorithmic Unicode 1.0 names because Unihan names are - // the same as the modern ones, extension A was only introduced with - // Unicode 3.0, and the Hangul syllable block was moved and changed - // around Unicode 1.1.5. - if (choice != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { - // index in terms integer index - StringBuffer s = new StringBuffer(); - - for (int index = m_algorithm_.length - 1; index >= 0; index --) { - if (m_algorithm_[index].contains(ch)) { - m_algorithm_[index].appendName(ch, s); - return s.toString(); - } - } - } - return null; - } - - /** - * Getting the character with the tokenized argument name - * @param name of the character - * @return character with the tokenized argument name or -1 if character - * is not found - */ - private synchronized int getGroupChar(String name, int choice) - { - for (int i = 0; i < m_groupcount_; i ++) { - // populating the data set of grouptable - - int startgpstrindex = getGroupLengths(i, m_groupoffsets_, - m_grouplengths_); - - // shift out to function - int result = getGroupChar(startgpstrindex, m_grouplengths_, name, - choice); - if (result != -1) { - return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_) - | result; - } - } - return -1; - } - - /** - * Compares and retrieve character if name is found within the argument - * group - * @param index index where the set of names 
reside in the group block - * @param length list of lengths of the strings - * @param name character name to search for - * @param choice of either 1.0 or the most current unicode name - * @return relative character in the group which matches name, otherwise if - * not found, -1 will be returned - */ - private int getGroupChar(int index, char length[], String name, - int choice) - { - byte b = 0; - char token; - int len; - int namelen = name.length(); - int nindex; - int count; - - for (int result = 0; result <= LINES_PER_GROUP_; result ++) { - nindex = 0; - len = length[result]; - - if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { - int oldindex = index; - index += UCharacterUtil.skipByteSubString(m_groupstring_, - index, len, (byte)';'); - len -= (index - oldindex); - } - - // number of tokens is > the length of the name - // write each letter directly, and write a token word per token - for (count = 0; count < len && nindex != -1 && nindex < namelen; - ) { - b = m_groupstring_[index + count]; - count ++; - - if (b >= m_tokentable_.length) { - if (name.charAt(nindex ++) != (b & 0xFF)) { - nindex = -1; - } - } - else { - token = m_tokentable_[b & 0xFF]; - if (token == 0xFFFE) { - // this is a lead byte for a double-byte token - token = m_tokentable_[b << 8 | - (m_groupstring_[index + count] & 0x00ff)]; - count ++; - } - if (token == 0xFFFF) { - if (name.charAt(nindex ++) != (b & 0xFF)) { - nindex = -1; - } - } - else { - // compare token with name - nindex = UCharacterUtil.compareNullTermByteSubString( - name, m_tokenstring_, nindex, token); - } - } - } - - if (namelen == nindex && - (count == len || m_groupstring_[index + count] == ';')) { - return result; - } - - index += len; - } - return -1; - } - - /** - * Binary search for the group strings set that contains the argument Unicode - * code point's most significant bits. - * The return value is always a valid group string set that contain msb. 
- * If group string set is not found, -1 is returned - * @param ch the code point to look for - * @return group string set index in datatable otherwise -1 is returned if - * group string set is not found - */ - private int getGroupStringIndex(int ch) - { - // gets the msb - int msb = ch >> GROUP_SHIFT_, - end = m_groupcount_, - start, - gindex = 0; - - // binary search for the group of names that contains the one for code - for (start = 0; start < end - 1;) { - gindex = (start + end) >> 1; - if (msb < m_groupinfo_[gindex * m_groupsize_]) { - end = gindex; - } - else { - start = gindex; - } - } - - // return this if it is an exact match - if (msb == m_groupinfo_[start * m_groupsize_]) { - start = start * m_groupsize_; - return UCharacterUtil.toInt( - m_groupinfo_[start + OFFSET_HIGH_OFFSET_], - m_groupinfo_[start + OFFSET_LOW_OFFSET_]); - } - return -1; - } - - /** - * Gets the group name of the character - * @param ch character to get the group name - * @param choice name choice selector to choose a unicode 1.0 or newer name - */ - private synchronized String getGroupName(int ch, int choice) - { - // gets the msb - int msb = getCodepointMSB(ch); - int group = getGroup(ch); - - // return this if it is an exact match - if (msb == m_groupinfo_[group * m_groupsize_]) { - int index = getGroupLengths(group, m_groupoffsets_, - m_grouplengths_); - int offset = ch & GROUP_MASK_; - return getGroupName(index + m_groupoffsets_[offset], - m_grouplengths_[offset], choice); - } - - return null; - } - - /** - * Gets the character extended type - * @param ch character to be tested - * @return extended type it is associated with - */ - private int getType(int ch) - { - if (UCharacter.isNonCharacter(ch)) { - // not a character we return a invalid category count - return UCharacterCategory.NON_CHARACTER_; - } - int result = UCharacter.getType(ch); - if (result == UCharacterCategory.SURROGATE) { - if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { - result = 
UCharacterCategory.LEAD_SURROGATE_; - } - else { - result = UCharacterCategory.TRAIL_SURROGATE_; - } - } - return result; - } - - /** - * Getting the character with extended name of the form <....>. - * @param name of the character to be found - * @param choice name choice - * @return character associated with the name, -1 if such character is not - * found and -2 if we should continue with the search. - */ - private int getExtendedChar(String name, int choice) - { - if (name.charAt(0) == '<') { - if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { - int endIndex = name.length() - 1; - if (name.charAt(endIndex) == '>') { - int startIndex = name.lastIndexOf('-'); - if (startIndex >= 0) { // We've got a category. - startIndex ++; - int result = -1; - try { - result = Integer.parseInt( - name.substring(startIndex, endIndex), - 16); - } - catch (NumberFormatException e) { - return -1; - } - // Now validate the category name. We could use a - // binary search, or a trie, if we really wanted to. 
- String type = name.substring(1, startIndex - 1); - int length = UCharacterCategory.TYPE_NAMES_.length; - for (int i = 0; i < length; ++ i) { - if (type.compareTo( - UCharacterCategory.TYPE_NAMES_[i]) == 0) { - if (getType(result) == i) { - return result; - } - break; - } - } - } - } - } - return -1; - } - return -2; - } -} diff --git a/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java b/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java index 796c98e9383..9146b279a4e 100644 --- a/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java +++ b/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java @@ -5,8 +5,8 @@ ****************************************************************************** * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java,v $ -* $Date: 2002/04/05 01:38:15 $ -* $Revision: 1.3 $ +* $Date: 2002/09/19 21:18:14 $ +* $Revision: 1.4 $ * ****************************************************************************** */ @@ -14,6 +14,8 @@ package com.ibm.icu.lang; import com.ibm.icu.util.ValueIterator; +import com.ibm.icu.impl.UCharacterName; +import com.ibm.icu.impl.UCharacterNameChoice; /** *

Class enabling iteration of the codepoints and their names.

@@ -43,7 +45,7 @@ class UCharacterNameIterator implements ValueIterator return false; } - if (m_choice_ != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) { + if (m_choice_ != UCharacterNameChoice.UNICODE_10_CHAR_NAME) { int length = m_name_.getAlgorithmLength(); if (m_algorithmIndex_ < length) { while (m_algorithmIndex_ < length) { @@ -97,7 +99,7 @@ class UCharacterNameIterator implements ValueIterator m_current_ ++; return true; } - else if (m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { + else if (m_choice_ == UCharacterNameChoice.EXTENDED_CHAR_NAME) { if (!iterateExtended(element, m_limit_)) { m_current_ ++; return true; @@ -238,7 +240,7 @@ class UCharacterNameIterator implements ValueIterator index + GROUP_OFFSETS_[offset], GROUP_LENGTHS_[offset], m_choice_); if ((name == null || name.length() == 0) && - m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { + m_choice_ == UCharacterNameChoice.EXTENDED_CHAR_NAME) { name = m_name_.getExtendedName(m_current_); } if (name != null && name.length() > 0) { @@ -297,7 +299,7 @@ class UCharacterNameIterator implements ValueIterator if (gMIN > limit) { gMIN = limit; } - if (m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) { + if (m_choice_ == UCharacterNameChoice.EXTENDED_CHAR_NAME) { if (!iterateExtended(result, gMIN)) { return false; }