New character name iteration

X-SVN-Rev: 7913
This commit is contained in:
Syn Wee Quek 2002-03-08 02:04:00 +00:00
parent 2868b2a4d6
commit 51df46827d
5 changed files with 937 additions and 283 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterTest.java,v $
* $Date: 2002/03/02 02:04:07 $
* $Revision: 1.30 $
* $Date: 2002/03/08 02:03:16 $
* $Revision: 1.31 $
*
*******************************************************************************
*/
@ -24,6 +24,7 @@ import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ValueIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.BreakIterator;
@ -54,6 +55,21 @@ public final class UCharacterTest extends TestFmwk
// public methods ================================================
/**
 * Command-line entry point: hands this test class to the test framework so
 * that every test method is executed.
 * @param arg command-line arguments, passed through to the framework
 */
public static void main(String[] arg)
{
    try
    {
        UCharacterTest test = new UCharacterTest();
        // Run the whole suite. The previous body only called
        // TestNameIteration() (after a stray getName1_0 debugging call) and
        // left the framework invocation commented out.
        test.run(arg);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}
/**
* Testing the uppercase and lowercase function of UCharacter
*/
@ -635,8 +651,7 @@ public final class UCharacterTest extends TestFmwk
errln(
"FAIL: 'LATin smALl letTER A' should result in character U+0061");
}
// extra testing different from icu
for (int i = UCharacter.MIN_VALUE; i < UCharacter.MAX_VALUE; i ++)
{
@ -650,6 +665,123 @@ public final class UCharacterTest extends TestFmwk
}
}
/**
 * Testing name iteration.
 * Walks the modern, extended and Unicode 1.0 name iterators and checks every
 * returned element against the matching UCharacter lookup method.
 */
public void TestNameIteration()
{
    ValueIterator iterator = UCharacter.getNameIterator();
    ValueIterator.Element element = new ValueIterator.Element();
    ValueIterator.Element old = new ValueIterator.Element();
    // testing subrange
    iterator.setRange(0xF, 0x45);
    while (iterator.next(element)) {
        if (element.integer <= old.integer) {
            errln("FAIL next returned a less codepoint \\u" +
                  Integer.toHexString(element.integer) + " than \\u" +
                  Integer.toHexString(old.integer));
            break;
        }
        // a null lookup is reported as a failure instead of throwing
        String expected = UCharacter.getName(element.integer);
        if (expected == null || !expected.equals(element.value)) {
            errln("FAIL next codepoint \\u" +
                  Integer.toHexString(element.integer) +
                  " does not have the expected name " + expected +
                  " instead have the name " + (String)element.value);
            break;
        }
        old.integer = element.integer;
    }

    // after a reset the first named codepoint of the subrange is expected
    // again; a failed next() leaves element untouched, so its result is
    // checked as well
    iterator.reset();
    if (!iterator.next(element) || element.integer != 0x20) {
        errln("FAIL reset in iterator");
    }

    // full range, modern names: codepoints must ascend, names must match and
    // every skipped codepoint must have no name
    iterator.setRange(0, 0x110000);
    old.integer = 0;
    while (iterator.next(element)) {
        if (element.integer != 0 && element.integer <= old.integer) {
            errln("FAIL next returned a less codepoint \\u" +
                  Integer.toHexString(element.integer) + " than \\u" +
                  Integer.toHexString(old.integer));
            break;
        }
        String expected = UCharacter.getName(element.integer);
        if (expected == null || !expected.equals(element.value)) {
            errln("FAIL next codepoint \\u" +
                  Integer.toHexString(element.integer) +
                  " does not have the expected name " + expected +
                  " instead have the name " + (String)element.value);
            break;
        }
        for (int i = old.integer + 1; i < element.integer; i ++) {
            String skipped = UCharacter.getName(i);
            if (skipped != null) {
                errln("FAIL between codepoints are not null \\u" +
                      Integer.toHexString(old.integer) + " and " +
                      Integer.toHexString(element.integer) + " has " +
                      Integer.toHexString(i) + " with a name " + skipped);
                break;
            }
        }
        old.integer = element.integer;
    }

    // extended names: every codepoint is expected in sequence, so old.integer
    // is simply counted up
    iterator = UCharacter.getExtendedNameIterator();
    old.integer = 0;
    while (iterator.next(element)) {
        if (element.integer != 0 && element.integer != old.integer) {
            errln("FAIL next returned a codepoint \\u" +
                  Integer.toHexString(element.integer) +
                  " different from \\u" +
                  Integer.toHexString(old.integer));
            break;
        }
        String expected = UCharacter.getExtendedName(element.integer);
        if (expected == null || !expected.equals(element.value)) {
            errln("FAIL next codepoint \\u" +
                  Integer.toHexString(element.integer) + " name should be "
                  + expected + " instead of " + (String)element.value);
            break;
        }
        old.integer++;
    }

    // Unicode 1.0 names (the per-element debugging print to System.out has
    // been dropped)
    iterator = UCharacter.getName1_0Iterator();
    old.integer = 0;
    while (iterator.next(element)) {
        if (element.integer != 0 && element.integer <= old.integer) {
            errln("FAIL next returned a less codepoint \\u" +
                  Integer.toHexString(element.integer) + " than \\u" +
                  Integer.toHexString(old.integer));
            break;
        }
        String expected = UCharacter.getName1_0(element.integer);
        if (element.value == null || !element.value.equals(expected)) {
            errln("FAIL next codepoint \\u" +
                  Integer.toHexString(element.integer) +
                  " does not have the expected 1.0 name " + expected +
                  " instead have the name " + (String)element.value);
            break;
        }
        for (int i = old.integer + 1; i < element.integer; i ++) {
            String skipped = UCharacter.getName1_0(i);
            if (skipped != null) {
                errln("FAIL between codepoints are not null \\u" +
                      Integer.toHexString(old.integer) + " and " +
                      Integer.toHexString(element.integer) + " has " +
                      Integer.toHexString(i) + " with a name " + skipped);
                break;
            }
        }
        old.integer = element.integer;
    }
    /* ### TODO: test error cases and other interesting things */
}
/**
* Testing for illegal characters
*/
@ -1069,19 +1201,5 @@ public final class UCharacterTest extends TestFmwk
}
return result;
}
// NOTE(review): the hunk header above ("-1069,19 +1201,5") shows this region
// shrinking, so this appears to be the pre-change copy of main() that the
// commit removes from the end of the class -- confirm.
// Runs only TestCaseTitle(); the full framework run is commented out.
public static void main(String[] arg)
{
try
{
UCharacterTest test = new UCharacterTest();
test.TestCaseTitle();
//test.run(arg);
}
catch (Exception e)
{
// failures are only printed, nothing is rethrown
e.printStackTrace();
}
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $
* $Date: 2002/03/02 02:04:09 $
* $Revision: 1.27 $
* $Date: 2002/03/08 02:04:00 $
* $Revision: 1.28 $
*
*******************************************************************************
*/
@ -18,6 +18,7 @@ import com.ibm.icu.impl.UnicodeProperty;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.ValueIterator;
import com.ibm.icu.text.BreakIterator;
/**
@ -879,7 +880,7 @@ public final class UCharacter
*/
public static String getUnicodeVersion()
{
// NOTE(review): two return statements appear back to back. This looks like
// the before/after pair of a diff hunk (the second line adds toString())
// rather than one method body -- confirm against the real file.
return PROPERTY_.m_unicodeVersion_;
return PROPERTY_.m_unicodeVersion_.toString();
}
/**
@ -1067,6 +1068,7 @@ public final class UCharacter
* @param breakiter break iterator to determine the positions in which
* the character should be title cased.
* @return titlecase version of the argument string
* @draft 2.1
*/
public static String toTitleCase(String str, BreakIterator breakiter)
{
@ -1117,6 +1119,7 @@ public final class UCharacter
* @param breakiter break iterator to determine the positions in which
* the character should be title cased.
* @return titlecase version of the argument string
* @draft 2.1
*/
public static String toTitleCase(Locale locale, String str,
BreakIterator breakiter)
@ -1340,13 +1343,14 @@ public final class UCharacter
* Example of use:<br>
* <pre>
* RangeValueIterator iterator = UCharacter.getTypeIterator();
* while (iterator.next()) {
* RangeValueIterator.Element element = new RangeValueIterator.Element();
* while (iterator.next(element)) {
* System.out.println("Codepoint \\u" +
* Integer.toHexString(iterator.getStart()) +
* Integer.toHexString(element.start) +
* " to codepoint \\u" +
* Integer.toHexString(iterator.getLimit() - 1) +
* Integer.toHexString(element.limit - 1) +
* " has the character type " +
* iterator.getValue());
* element.value);
* }
* </pre>
* @return an iterator
@ -1356,6 +1360,98 @@ public final class UCharacter
{
return new UCharacterTypeIterator();
}
/**
 * <p>Gets an iterator for character names, iterating over codepoints.</p>
 * <p>The iterator returned here yields the modern, most up-to-date Unicode
 * names. For the older Unicode 1.0 names use getName1_0Iterator(), for
 * extended names use getExtendedNameIterator().</p>
 * Example of use:<br>
 * <pre>
 * ValueIterator iterator = UCharacter.getNameIterator();
 * ValueIterator.Element element = new ValueIterator.Element();
 * while (iterator.next(element)) {
 *     System.out.println("Codepoint \\u" +
 *                        Integer.toHexString(element.integer) +
 *                        " has the name " + (String)element.value);
 * }
 * </pre>
 * @return an iterator
 * @draft 2.1
 */
public static ValueIterator getNameIterator()
{
    int choice = UCharacterNameChoice.U_UNICODE_CHAR_NAME;
    return new UCharacterNameIterator(NAME_, choice);
}
/**
 * <p>Gets an iterator for character names, iterating over codepoints.</p>
 * <p>The iterator returned here yields the older Unicode 1.0 names. For the
 * modern, most up-to-date Unicode names use getNameIterator(), for extended
 * names use getExtendedNameIterator().</p>
 * Example of use:<br>
 * <pre>
 * ValueIterator iterator = UCharacter.getName1_0Iterator();
 * ValueIterator.Element element = new ValueIterator.Element();
 * while (iterator.next(element)) {
 *     System.out.println("Codepoint \\u" +
 *                        Integer.toHexString(element.integer) +
 *                        " has the name " + (String)element.value);
 * }
 * </pre>
 * @return an iterator
 * @draft 2.1
 */
public static ValueIterator getName1_0Iterator()
{
    int choice = UCharacterNameChoice.U_UNICODE_10_CHAR_NAME;
    return new UCharacterNameIterator(NAME_, choice);
}
/**
 * <p>Gets an iterator for character names, iterating over codepoints.</p>
 * <p>The iterator returned here yields the extended names. For the modern,
 * most up-to-date Unicode names use getNameIterator(), for the older
 * Unicode 1.0 names use getName1_0Iterator().</p>
 * Example of use:<br>
 * <pre>
 * ValueIterator iterator = UCharacter.getExtendedNameIterator();
 * ValueIterator.Element element = new ValueIterator.Element();
 * while (iterator.next(element)) {
 *     System.out.println("Codepoint \\u" +
 *                        Integer.toHexString(element.integer) +
 *                        " has the name " + (String)element.value);
 * }
 * </pre>
 * @return an iterator
 * @draft 2.1
 */
public static ValueIterator getExtendedNameIterator()
{
    int choice = UCharacterNameChoice.U_EXTENDED_CHAR_NAME;
    return new UCharacterNameIterator(NAME_, choice);
}
// protected data members --------------------------------------------
/**
* Database storing the sets of character name
*/
protected static final UCharacterName NAME_;
// block to initialise name database and unicode 1.0 data indicator
// Loads the character name database once, when the class is initialised.
static
{
    try
    {
        NAME_ = new UCharacterName();
    }
    catch (Exception e)
    {
        // Chain the original exception as the cause so the real failure
        // (and its stack trace) is not reduced to a bare message.
        throw new RuntimeException(e.getMessage(), e);
    }
}
// protected methods -------------------------------------------------
@ -1382,24 +1478,6 @@ public final class UCharacter
private static final UCharacterProperty PROPERTY_ =
UnicodeProperty.PROPERTY;
/**
* Database storing the sets of character name
*/
private static final UCharacterName NAME_;
// block to initialise name database and unicode 1.0 data indicator
// NOTE(review): the hunk header above ("-1382,24 +1478,6") shows this region
// shrinking, so this appears to be the pre-change copy of the initializer
// that the commit removes here -- confirm.
static
{
try
{
NAME_ = new UCharacterName();
}
catch (Exception e)
{
// only the message is carried over into the RuntimeException
throw new RuntimeException(e.getMessage());
}
}
/**
* To get the last character out from a data type
*/

View file

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $
* $Date: 2002/03/02 01:50:51 $
* $Revision: 1.13 $
* $Date: 2002/03/08 02:04:00 $
* $Revision: 1.14 $
*
*******************************************************************************
*/
@ -273,8 +273,7 @@ final class UCharacterName
indexes[0] = offset;
// joining up the factorized strings
if (compareFactorString(indexes,
name.substring(prefixlen))) {
if (compareFactorString(indexes, name, prefixlen)) {
return ch;
}
}
@ -337,16 +336,18 @@ final class UCharacterName
* the argument string
* @param index array with each index corresponding to each factor block
* @param str string to compare with
* @param offset of str to start comparison
* @return true if string matches
*/
private boolean compareFactorString(int index[], String str)
private boolean compareFactorString(int index[], String str,
int offset)
{
int size = m_factor_.length;
if (index == null || index.length != size)
return false;
int count = 0;
int strcount = 0;
int strcount = offset;
int factor;
size --;
for (int i = 0; i <= size; i ++)
@ -372,6 +373,22 @@ final class UCharacterName
}
}
// protected data members --------------------------------------------
/**
* Maximum number of groups
*/
protected int m_groupcount_ = 0;
/**
* Size of each group
*/
protected int m_groupsize_ = 0;
/**
* Number of lines per group
* 1 << GROUP_SHIFT_
*/
protected static final int LINES_PER_GROUP_ = 1 << 5;
// protected constructor ---------------------------------------------
/**
@ -541,113 +558,6 @@ final class UCharacterName
return false;
}
// private data members ----------------------------------------------
/**
* Data used in unames.dat
*/
private char m_tokentable_[];
private byte m_tokenstring_[];
private char m_groupinfo_[];
private byte m_groupstring_[];
private AlgorithmName m_algorithm_[];
/**
* Number of group sets
*/
private int m_groupcount_ = 0;
private int m_groupsize_ = 0;
/**
* Default name of the name datafile
*/
private static final String NAME_FILE_NAME_ =
"/com/ibm/icu/impl/data/unames.dat";
/**
* Default buffer size of datafile
*/
private static final int NAME_BUFFER_SIZE_ = 100000;
/**
* Shift count to retrieve group information
*/
private static final int GROUP_SHIFT_ = 5;
/**
* Number of lines per group
*/
private static final int LINES_PER_GROUP_ = 1 << GROUP_SHIFT_;
/**
* Mask to retrieve the offset for a particular character within a group
*/
private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
/**
* Position of offsethigh in group information array
*/
private static final int OFFSET_HIGH_OFFSET_ = 1;
/**
* Position of offsetlow in group information array
*/
private static final int OFFSET_LOW_OFFSET_ = 2;
/**
* Double nibble indicator, any nibble > this number has to be combined
* with its following nibble
*/
private static final int SINGLE_NIBBLE_MAX_ = 11;
// private methods ---------------------------------------------------
/**
* Gets the algorithmic name for the argument character
* @param ch character to determine name for
* @param choice name choice
* @return the algorithmic name or null if not found
*/
// NOTE(review): the hunk header above ("-541,113 +558,6") shows this region
// shrinking, so this appears to be the pre-change copy of getAlgName --
// confirm.
private String getAlgName(int ch, int choice)
{
// Do not write algorithmic Unicode 1.0 names because Unihan names are
// the same as the modern ones, extension A was only introduced with
// Unicode 3.0, and the Hangul syllable block was moved and changed
// around Unicode 1.1.5.
if (choice != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
// index in terms integer index
StringBuffer s = new StringBuffer();
// scan the algorithmic ranges from last to first and use the first range
// that contains ch
for (int index = m_algorithm_.length - 1; index >= 0; index --) {
if (m_algorithm_[index].contains(ch)) {
// always true here: the loop condition already guarantees index >= 0
if (index >= 0) {
m_algorithm_[index].appendName(ch, s);
return s.toString();
}
}
}
}
return null;
}
/**
* Getting the character with the tokenized argument name
* @param name of the character
* @return character with the tokenized argument name or -1 if character
* is not found
*/
// NOTE(review): lies in the same shrinking hunk ("-541,113 +558,6"), so this
// appears to be the pre-change copy of getGroupChar -- confirm.
private int getGroupChar(String name, int choice)
{
int result = 0;
// try every group in turn and stop at the first one that yields a character
for (int i = 0; i < m_groupcount_; i ++) {
result = getGroupChar(i, name, choice);
if (result != -1) {
return result;
}
}
// no group matched the name
return -1;
}
/**
* Reads a block of compressed lengths of 32 strings and expands them into
* offsets and lengths for each string. Lengths are stored with a
@ -664,7 +574,7 @@ final class UCharacterName
* @return next index of the data string immediately after the lengths
* in terms of byte address
*/
private int getGroupLengths(int index, char offsets[], char lengths[])
protected int getGroupLengths(int index, char offsets[], char lengths[])
{
char length = 0xffff;
byte b = 0,
@ -687,22 +597,22 @@ final class UCharacterName
// getting nibble
n = (byte)((b >> shift) & 0x0F);
if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
length = (char)((n - 12) << 4);
length = (char)((n - 12) << 4);
}
else {
if (length != 0xffff) {
lengths[i] = (char)((length | n) + 12);
}
else {
lengths[i] = (char)n;
}
if (length != 0xffff) {
lengths[i] = (char)((length | n) + 12);
}
else {
lengths[i] = (char)n;
}
if (i < LINES_PER_GROUP_) {
offsets[i + 1] = (char)(offsets[i] + lengths[i]);
}
if (i < LINES_PER_GROUP_) {
offsets[i + 1] = (char)(offsets[i] + lengths[i]);
}
length = 0xffff;
i ++;
length = 0xffff;
i ++;
}
shift -= 4;
@ -710,7 +620,7 @@ final class UCharacterName
}
return stringoffset;
}
/**
* Gets the name of the argument group index
* @param index of the group name string in byte count
@ -718,13 +628,13 @@ final class UCharacterName
* @param choice of Unicode 1.0 name or the most current name
* @return name of the group
*/
private String getGroupName(int index, int length, int choice)
protected String getGroupName(int index, int length, int choice)
{
if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
int oldindex = index;
index += UCharacterUtil.skipByteSubString(m_groupstring_, index,
length, (byte)';');
length -= (index - oldindex);
int oldindex = index;
index += UCharacterUtil.skipByteSubString(m_groupstring_,
index, length, (byte)';');
length -= (index - oldindex);
}
StringBuffer s = new StringBuffer();
@ -736,7 +646,7 @@ final class UCharacterName
if (b >= m_tokentable_.length) {
if (b == ';') {
break;
break;
}
s.append(b); // implicit letter
}
@ -750,6 +660,13 @@ final class UCharacterName
}
if (token == 0xFFFF) {
if (b == ';') {
// skip the semicolon if we are seeking extended
// names and there was no 2.0 name but there
// is a 1.0 name.
if (s.length() == 0 && choice ==
UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
continue;
}
break;
}
s.append((char)(b & 0x00ff)); // explicit letter
@ -766,6 +683,300 @@ final class UCharacterName
}
return s.toString();
}
/**
 * Retrieves the extended name of a codepoint: the modern Unicode name when
 * there is one, otherwise whatever getExtendedOr10Name(ch) produces.
 * @param ch codepoint
 * @return modern name of ch, or the result of getExtendedOr10Name(ch) when
 *         ch has no modern name
 */
protected String getExtendedName(int ch)
{
    String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
    if (result == null) {
        // getExtendedOr10Name already tries the 1.0 name for control
        // characters before building a synthetic name, so that lookup is
        // not repeated here.
        result = getExtendedOr10Name(ch);
    }
    return result;
}
/**
 * Finds the group index for the codepoint by binary search over the group
 * MSBs; when no group matches exactly, the index of the group before it is
 * returned.
 * @param codepoint
 * @return group index containing codepoint or the group before it.
 */
protected int getGroup(int codepoint)
{
    int msb = getCodepointMSB(codepoint);
    int low = 0;
    int high = m_groupcount_;
    // narrow [low, high) until a single candidate group is left
    while (low < high - 1) {
        int middle = (low + high) >> 1;
        if (msb < getGroupMSB(middle)) {
            high = middle;
        }
        else {
            low = middle;
        }
    }
    return low;
}
/**
 * Gets the name to use when the most current Unicode name is missing: the
 * Unicode 1.0 name for control characters that have one, otherwise a
 * synthetic name made of '<', the type name, '-', the codepoint as at least
 * four uppercase hex digits, and '>'.
 * @param ch codepoint
 * @return name of codepoint extended or 1.0
 */
protected String getExtendedOr10Name(int ch)
{
    int type = getType(ch);
    if (type == UCharacterCategory.CONTROL) {
        String oldName = getName(ch,
                             UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
        if (oldName != null) {
            return oldName;
        }
    }
    // Fall back to the unknown type name if the table of type names is not
    // up to date.
    String typeName;
    if (type < UCharacterCategory.TYPE_NAMES_.length) {
        typeName = UCharacterCategory.TYPE_NAMES_[type];
    }
    else {
        typeName = UCharacterCategory.UNKNOWN_TYPE_NAME_;
    }
    String hex = Integer.toHexString(ch).toUpperCase();
    StringBuffer buffer = new StringBuffer(typeName.length() + 9);
    buffer.append('<');
    buffer.append(typeName);
    buffer.append('-');
    // left-pad the hex digits with zeros up to four characters
    for (int digits = hex.length(); digits < 4; digits ++) {
        buffer.append('0');
    }
    buffer.append(hex);
    buffer.append('>');
    return buffer.toString();
}
// these are all UCharacterNameIterator use methods -------------------
/**
 * Gets the MSB from the group index
 * @param gindex group index
 * @return the MSB of the group if gindex is valid, -1 otherwise; negative
 *         indexes are treated as invalid too
 */
protected int getGroupMSB(int gindex)
{
    // reject both ends of the range so that an unset (negative) index yields
    // the documented -1 instead of an array index exception
    if (gindex < 0 || gindex >= m_groupcount_) {
        return -1;
    }
    return m_groupinfo_[gindex * m_groupsize_];
}
/**
 * Gets the MSB of the codepoint, i.e. the codepoint shifted right by
 * GROUP_SHIFT_ bits.
 * @param codepoint
 * @return the MSB of the codepoint
 */
protected int getCodepointMSB(int codepoint)
{
    int msb = codepoint >> GROUP_SHIFT_;
    return msb;
}
/**
 * Gets the maximum codepoint + 1 of the group
 * @param msb MSB value identifying the group
 * @return limit codepoint of the group
 */
protected int getGroupLimit(int msb)
{
    int first = msb << GROUP_SHIFT_;
    return first + LINES_PER_GROUP_;
}
/**
 * Gets the minimum codepoint of the group
 * @param msb MSB value identifying the group
 * @return minimum codepoint of the group
 */
protected int getGroupMin(int msb)
{
    int first = msb << GROUP_SHIFT_;
    return first;
}
/**
 * Gets the offset of the codepoint within its group, i.e. the bits of the
 * codepoint selected by GROUP_MASK_.
 * @param codepoint
 * @return offset of codepoint inside its group
 */
protected int getGroupOffset(int codepoint)
{
    int offset = codepoint & GROUP_MASK_;
    return offset;
}
/**
 * Gets the minimum codepoint of a group, obtained by clearing the
 * GROUP_MASK_ bits of the codepoint.
 * @param codepoint
 * @return minimum codepoint in the group which codepoint belongs to
 */
protected int getGroupMinFromCodepoint(int codepoint)
{
    int keepHighBits = ~GROUP_MASK_;
    return codepoint & keepHighBits;
}
/**
 * Gets the number of algorithmic name ranges
 * @return length of the algorithm range array
 */
protected int getAlgorithmLength()
{
    int count = m_algorithm_.length;
    return count;
}
/**
 * Gets the start of an algorithmic range
 * @param index algorithm index
 * @return algorithm range start
 */
protected int getAlgorithmStart(int index)
{
    AlgorithmName range = m_algorithm_[index];
    return range.m_rangestart_;
}
/**
 * Gets the end of an algorithmic range
 * @param index algorithm index
 * @return algorithm range end
 */
protected int getAlgorithmEnd(int index)
{
    AlgorithmName range = m_algorithm_[index];
    return range.m_rangeend_;
}
/**
 * Builds the algorithmic name of the codepoint from the given range
 * @param index algorithmic range index
 * @param codepoint
 * @return algorithmic name of codepoint
 */
protected String getAlgorithmName(int index, int codepoint)
{
    AlgorithmName range = m_algorithm_[index];
    StringBuffer name = new StringBuffer();
    range.appendName(codepoint, name);
    return name.toString();
}
// private data members ----------------------------------------------
/**
* Data used in unames.dat
*/
private char m_tokentable_[];
private byte m_tokenstring_[];
private char m_groupinfo_[];
private byte m_groupstring_[];
private AlgorithmName m_algorithm_[];
/**
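* Scratch buffers for the per-group offsets and lengths, filled in by
* getGroupLengths and shared by the synchronized group lookups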
*/
private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1];
private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1];
/**
* Default name of the name datafile
*/
private static final String NAME_FILE_NAME_ =
"/com/ibm/icu/impl/data/unames.dat";
/**
* Shift count to retrieve group information
*/
private static final int GROUP_SHIFT_ = 5;
/**
* Mask to retrieve the offset for a particular character within a group
*/
private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
/**
* Default buffer size of datafile
*/
private static final int NAME_BUFFER_SIZE_ = 100000;
/**
* Position of offsethigh in group information array
*/
private static final int OFFSET_HIGH_OFFSET_ = 1;
/**
* Position of offsetlow in group information array
*/
private static final int OFFSET_LOW_OFFSET_ = 2;
/**
* Double nibble indicator, any nibble > this number has to be combined
* with its following nibble
*/
private static final int SINGLE_NIBBLE_MAX_ = 11;
// private methods ---------------------------------------------------
/**
 * Looks up the algorithmic name for the argument character
 * @param ch character to determine name for
 * @param choice name choice
 * @return the algorithmic name or null if not found
 */
private String getAlgName(int ch, int choice)
{
    // Do not write algorithmic Unicode 1.0 names because Unihan names are
    // the same as the modern ones, extension A was only introduced with
    // Unicode 3.0, and the Hangul syllable block was moved and changed
    // around Unicode 1.1.5.
    if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
        return null;
    }
    // walk the ranges from the last one down to the first
    int index = m_algorithm_.length;
    while (-- index >= 0) {
        AlgorithmName range = m_algorithm_[index];
        if (range.contains(ch)) {
            StringBuffer name = new StringBuffer();
            range.appendName(ch, name);
            return name.toString();
        }
    }
    return null;
}
/**
 * Searches every group for the character with the tokenized argument name
 * @param name of the character
 * @param choice name choice, passed on to the per-group search
 * @return character with the tokenized argument name or -1 if character
 *         is not found
 */
private synchronized int getGroupChar(String name, int choice)
{
    int group = 0;
    while (group < m_groupcount_) {
        // expand the lengths of this group into the shared scratch buffers
        int stringstart = getGroupLengths(group, m_groupoffsets_,
                                          m_grouplengths_);
        // the per-group search yields the offset inside the group, or -1
        int offset = getGroupChar(stringstart, m_grouplengths_, name,
                                  choice);
        if (offset != -1) {
            int msb = m_groupinfo_[group * m_groupsize_];
            return (msb << GROUP_SHIFT_) | offset;
        }
        group ++;
    }
    return -1;
}
/**
* Compares and retrieve character if name is found within the argument
@ -884,62 +1095,45 @@ final class UCharacterName
* @param ch character to get the group name
* @param choice name choice selector to choose a unicode 1.0 or newer name
*/
private String getGroupName(int ch, int choice)
private synchronized String getGroupName(int ch, int choice)
{
// NOTE(review): this span interleaves two versions of the method (two
// signatures, duplicate declarations of msb and index, two return
// statements). It looks like the removed and added lines of a diff hunk
// shown together -- confirm against the real file before editing.
// The first version runs its own binary search and uses local arrays; the
// second calls getCodepointMSB/getGroup and uses the shared m_groupoffsets_
// and m_grouplengths_ buffers.
// gets the msb
int msb = ch >> GROUP_SHIFT_,
end = m_groupcount_,
start,
gindex = 0;
// binary search for the group of names that contains the one for
// code
for (start = 0; start < end - 1;) {
gindex = (start + end) >> 1;
if (msb < m_groupinfo_[gindex * m_groupsize_]) {
end = gindex;
}
else {
start = gindex;
}
}
int msb = getCodepointMSB(ch);
int group = getGroup(ch);
// return this if it is an exact match
if (msb == m_groupinfo_[start * m_groupsize_]) {
char offsets[] = new char[LINES_PER_GROUP_ + 1];
char lengths[] = new char[LINES_PER_GROUP_ + 1];
int index = getGroupLengths(start, offsets, lengths);
if (msb == m_groupinfo_[group * m_groupsize_]) {
int index = getGroupLengths(group, m_groupoffsets_,
m_grouplengths_);
int offset = ch & GROUP_MASK_;
return getGroupName(index + offsets[offset], lengths[offset],
choice);
return getGroupName(index + m_groupoffsets_[offset],
m_grouplengths_[offset], choice);
}
// no group stores a name for ch
return null;
}
/**
* Getting the character with the tokenized argument name
* @param index of the group to check
* @param name of the character
* @param choice of Unicode version used
* @return character with the tokenized argument name or -1 if character
* is not found
* Gets the character extended type
* @param ch character to be tested
* @return extended type it is associated with
*/
private int getGroupChar(int index, String name, int choice)
private int getType(int ch)
{
// NOTE(review): this span interleaves two different methods: the body of
// getGroupChar(int, String, int) and the body of getType(int). It looks
// like the removed and added lines of a diff hunk shown together --
// confirm against the real file before editing.
// populating the data set of grouptable
char offsets[] = new char[LINES_PER_GROUP_ + 1];
char lengths[] = new char[LINES_PER_GROUP_ + 1];
int startgpstrindex = getGroupLengths(index, offsets, lengths);
// shift out to function
int result = getGroupChar(startgpstrindex, lengths, name, choice);
if (result != -1) {
return (m_groupinfo_[index * m_groupsize_] << GROUP_SHIFT_) |
result;
}
return -1;
if (UCharacter.isNonCharacter(ch)) {
// not a character we return a invalid category count
return UCharacterCategory.NON_CHARACTER_;
}
int result = UCharacter.getType(ch);
// surrogates are split into lead and trail pseudo-categories
if (result == UCharacterCategory.SURROGATE) {
if (ch <= UnicodeProperty.LEAD_SURROGATE_MAX_VALUE) {
result = UCharacterCategory.LEAD_SURROGATE_;
}
else {
result = UCharacterCategory.TRAIL_SURROGATE_;
}
}
return result;
}
/**
@ -987,65 +1181,4 @@ final class UCharacterName
}
return -2;
}
/**
* Gets the character extended type
* @param ch character to be tested
* @return extended type it is associated with
*/
// NOTE(review): the hunk header above ("-987,65 +1181,4") shows this region
// shrinking, so this appears to be the pre-change copy of getType that the
// commit removes from here -- confirm.
private int getType(int ch)
{
if (UCharacter.isNonCharacter(ch)) {
// not a character we return a invalid category count
return UCharacterCategory.NON_CHARACTER_;
}
int result = UCharacter.getType(ch);
// surrogates are split into lead and trail pseudo-categories
if (result == UCharacterCategory.SURROGATE) {
if (ch <= UnicodeProperty.LEAD_SURROGATE_MAX_VALUE) {
result = UCharacterCategory.LEAD_SURROGATE_;
}
else {
result = UCharacterCategory.TRAIL_SURROGATE_;
}
}
return result;
}
/**
* Retrieves the extended name
*/
// NOTE(review): lies in the same shrinking hunk ("-987,65 +1181,4"), so this
// appears to be the pre-change copy of getExtendedName, with the synthetic
// name construction still inline -- confirm.
private String getExtendedName(int ch)
{
// the modern name wins when there is one
String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
if (result == null) {
// control characters fall back to their Unicode 1.0 name
if (getType(ch) == UCharacterCategory.CONTROL) {
result = getName(ch,
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
}
if (result == null) {
int type = getType(ch);
// Return unknown if the table of names above is not up to
// date.
if (type >= UCharacterCategory.TYPE_NAMES_.length) {
result = UCharacterCategory.UNKNOWN_TYPE_NAME_;
}
else {
result = UCharacterCategory.TYPE_NAMES_[type];
}
// build '<' + type name + '-' + hex codepoint + '>'
StringBuffer tempResult = new StringBuffer(result);
tempResult.insert(0, '<');
tempResult.append('-');
String chStr = Integer.toHexString(ch).toUpperCase();
// left-pad the hex digits with zeros up to four characters
int zeros = 4 - chStr.length();
while (zeros > 0) {
tempResult.append('0');
zeros --;
}
tempResult.append(chStr);
tempResult.append('>');
result = tempResult.toString();
}
}
return result;
}
}

View file

@ -0,0 +1,313 @@
/*
******************************************************************************
* Copyright (C) 1996-2002, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java,v $
* $Date: 2002/03/08 02:04:00 $
* $Revision: 1.1 $
*
******************************************************************************
*/
package com.ibm.icu.lang;
import com.ibm.icu.util.ValueIterator;
/**
* Class enabling iteration of the codepoints and their names.
* The result of each iteration holds a valid codepoint together with its
* name.
* See UCharacter.getNameIterator() for an example of use.
* @author synwee
* @since release 2.1, March 5 2002
*/
class UCharacterNameIterator implements ValueIterator
{
// public methods ----------------------------------------------------
/**
* <p>Gets the next result for this iteration and returns
* true if we are not at the end of the iteration, false otherwise.</p>
* <p>If the return boolean is a false, the contents of element will not
* be updated.</p>
* <p>Unless Unicode 1.0 names were chosen, codepoints inside an algorithmic
* range get their algorithmic name; all other codepoints are handed to
* iterateGroup and, for the extended choice, to iterateExtended.</p>
* @param element for storing the result codepoint and name
* @return true if we are not at the end of the iteration, false otherwise.
* @see Element
* @draft 2.1
*/
public boolean next(ValueIterator.Element element)
{
// nothing left in the range
if (m_current_ >= m_limit_) {
return false;
}
// algorithmic ranges are only consulted when the 1.0 names were not chosen
if (m_choice_ != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
int length = m_name_.getAlgorithmLength();
if (m_algorithmIndex_ < length) {
while (m_algorithmIndex_ < length) {
// find the algorithm range that could contain m_current_
// (an index below 0 means no range has been selected yet)
if (m_algorithmIndex_ < 0 ||
m_name_.getAlgorithmEnd(m_algorithmIndex_) <
m_current_) {
m_algorithmIndex_ ++;
}
else {
break;
}
}
if (m_algorithmIndex_ < length) {
// interleave the data-driven ones with the algorithmic ones
// iterate over all algorithmic ranges; assume that they are
// in ascending order
int start = m_name_.getAlgorithmStart(m_algorithmIndex_);
if (m_current_ < start) {
// this should get rid of those codepoints that are not
// in the algorithmic range
int end = start;
if (m_limit_ <= start) {
end = m_limit_;
}
// NOTE(review): iterateGroup is not visible here; it presumably follows
// the convention documented on iterateSingleGroup (false = a named
// codepoint was stored in element) -- confirm.
if (!iterateGroup(element, end)) {
m_current_ ++;
return true;
}
}
if (m_current_ >= m_limit_) {
// after iterateGroup fails, current codepoint may be
// greater than limit
return false;
}
// m_current_ is handled by the selected algorithmic range: report its
// algorithmic name
element.integer = m_current_;
element.value = m_name_.getAlgorithmName(m_algorithmIndex_,
m_current_);
// reset the group index if we are in the algorithmic names
m_groupIndex_ = -1;
m_current_ ++;
return true;
}
}
}
// enumerate the character names after the last algorithmic range
if (!iterateGroup(element, m_limit_)) {
m_current_ ++;
return true;
}
// for the extended choice, iterateExtended is tried when iterateGroup
// stored nothing
else if (m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
if (!iterateExtended(element, m_limit_)) {
m_current_ ++;
return true;
}
}
return false;
}
/**
 * Resets the iterator to the beginning of the iteration: the current
 * codepoint goes back to the start of the range and the group and algorithm
 * indexes are cleared.
 * @draft 2.1
 */
public void reset()
{
    m_algorithmIndex_ = -1;
    m_groupIndex_ = -1;
    m_current_ = m_start_;
}
/**
 * Sets the range for iteration and restarts the iteration at start.
 * @param start first codepoint to iterate
 * @param limit one codepoint after the last codepoint to iterate
 * @exception IllegalArgumentException thrown when start or limit exceed
 *            the Unicode codepoint bounds or when start > limit.
 */
public void setRange(int start, int limit)
{
    if (start > limit || start < UCharacter.MIN_VALUE ||
        limit > UCharacter.MAX_VALUE + 1) {
        throw new IllegalArgumentException(
            "start or limit has to be valid Unicode codepoints and start <= limit");
    }
    m_start_ = start;
    m_limit_ = limit;
    m_current_ = start;
    // Forget the group and algorithm positions of any earlier iteration.
    // next() only ever moves the algorithm index forward, so a stale index
    // would skip the algorithmic names of a range that starts earlier.
    m_groupIndex_ = -1;
    m_algorithmIndex_ = -1;
}
// protected constructor ---------------------------------------------
/**
 * Creates an iterator whose range initially runs from
 * UCharacter.MIN_VALUE up to and including UCharacter.MAX_VALUE.
 * @param name name data
 * @param choice name choice from the class
 *               com.ibm.icu.lang.UCharacterNameChoice
 * @draft 2.1
 */
protected UCharacterNameIterator(UCharacterName name, int choice)
{
    // the limit is exclusive, hence the + 1
    m_start_ = UCharacter.MIN_VALUE;
    m_limit_ = UCharacter.MAX_VALUE + 1;
    m_current_ = m_start_;
    // no explicit choice in UCharacter so no checks on choice
    m_choice_ = choice;
    m_name_ = name;
}
// private data members ---------------------------------------------
/**
 * Name data queried for every name this iterator returns
 */
private UCharacterName m_name_;
/**
 * Name choice, compared against the UCharacterNameChoice constants
 */
private int m_choice_;
/**
 * Start iteration range
 */
private int m_start_;
/**
 * End + 1 iteration range
 */
private int m_limit_;
/**
 * Current codepoint
 */
private int m_current_;
/**
 * Group index, -1 while no group has been looked up yet
 */
private int m_groupIndex_ = -1;
/**
 * Algorithm index, -1 while no algorithmic range has been looked up yet
 */
private int m_algorithmIndex_ = -1;
/**
 * Scratch buffers filled with the offsets and lengths of the names in a
 * single group. They are static and therefore shared by every iterator;
 * iterateSingleGroup synchronizes on both arrays while it uses them.
 * Declared final so that the objects serving as monitors can never be
 * swapped for others, which would silently break the mutual exclusion.
 */
private static final char GROUP_OFFSETS_[] =
    new char[UCharacterName.LINES_PER_GROUP_ + 1];
private static final char GROUP_LENGTHS_[] =
    new char[UCharacterName.LINES_PER_GROUP_ + 1];
// private methods --------------------------------------------------
/**
 * Scans the group at m_groupIndex_ from the current codepoint onwards and
 * stops at the first codepoint that has a non-empty name. When the name
 * choice is U_EXTENDED_CHAR_NAME, a codepoint without a group name falls
 * back to its extended name. The current codepoint is advanced past every
 * codepoint that was rejected.
 * @param result stores the result codepoint and name
 * @param limit last codepoint + 1 in range to search
 * @return false if a codepoint with a name is found in group and we can
 *         bail from further iteration, true to continue on with the
 *         iteration
 */
private boolean iterateSingleGroup(Element result, int limit)
{
    // the scratch buffers are static, i.e. shared, so hold both monitors
    // for as long as their contents are in use
    synchronized(GROUP_OFFSETS_) {
        synchronized(GROUP_LENGTHS_) {
            final int groupStart = m_name_.getGroupLengths(
                          m_groupIndex_, GROUP_OFFSETS_, GROUP_LENGTHS_);
            for (; m_current_ < limit; ++ m_current_) {
                final int line = m_name_.getGroupOffset(m_current_);
                String found = m_name_.getGroupName(
                                   groupStart + GROUP_OFFSETS_[line],
                                   GROUP_LENGTHS_[line], m_choice_);
                boolean missing = found == null || found.length() == 0;
                if (missing
                    && m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
                    found = m_name_.getExtendedName(m_current_);
                    missing = found == null || found.length() == 0;
                }
                if (missing) {
                    // nameless codepoint, try the next one
                    continue;
                }
                result.integer = m_current_;
                result.value = found;
                return false;
            }
        }
    }
    return true;
}
/**
* Group name iteration across groups: starting at the current codepoint,
* walks the groups from m_groupIndex_ onwards until a codepoint with a
* name is found, limit is reached or the groups are exhausted.
* Codepoints inside a group are delegated to iterateSingleGroup;
* codepoints in front of the next group are only searched (through
* iterateExtended) when the name choice is U_EXTENDED_CHAR_NAME.
* Both m_current_ and m_groupIndex_ are advanced as a side effect.
* @param result stores the result codepoint and name
* @param limit last codepoint + 1 in range to search
* @return false if a codepoint with a name is found and we can
* bail from further iteration, true to continue on with the
* iteration
*/
private boolean iterateGroup(Element result, int limit)
{
// a negative index means no group has been looked up yet, so ask the
// name data for the group of the current codepoint
if (m_groupIndex_ < 0) {
m_groupIndex_ = m_name_.getGroup(m_current_);
}
while (m_groupIndex_ < m_name_.m_groupcount_ &&
m_current_ < limit) {
// iterate till the last group or the last codepoint
int startMSB = m_name_.getCodepointMSB(m_current_);
int gMSB = m_name_.getGroupMSB(m_groupIndex_); // can be -1
if (startMSB == gMSB) {
// the current codepoint lies inside the group at m_groupIndex_
if (startMSB == m_name_.getCodepointMSB(limit - 1)) {
// if start and limit - 1 are in the same group, then enumerate
// only in that one
return iterateSingleGroup(result, limit);
}
// enumerate characters in the partial start group, i.e. from the
// current codepoint up to the limit of this group
if (!iterateSingleGroup(result,
m_name_.getGroupLimit(gMSB))) {
return false;
}
++ m_groupIndex_; // continue with the next group
}
else if (startMSB > gMSB) {
// make sure that we start enumerating with the first group
// after start
m_groupIndex_ ++;
}
else {
// the current codepoint lies in front of the group at m_groupIndex_;
// gMIN is presumably the first codepoint of that group (confirm
// against UCharacterName.getGroupMin), capped at limit
int gMIN = m_name_.getGroupMin(gMSB);
if (gMIN > limit) {
gMIN = limit;
}
// the codepoints in the gap have no group names, so they are only
// searched when extended names were requested
if (m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
if (!iterateExtended(result, gMIN)) {
return false;
}
}
// nothing found in the gap, jump to its end
m_current_ = gMIN;
}
}
return true;
}
/**
 * Extended name iteration: advances the current codepoint until one with a
 * non-empty name from getExtendedOr10Name is found or limit is reached.
 * @param result stores the result codepoint and name
 * @param limit last codepoint + 1 in range to search
 * @return false if a codepoint with a name is found and we can
 *         bail from further iteration, true to continue on with the
 *         iteration (this will always be false for valid codepoints)
 */
private boolean iterateExtended(UCharacterNameIterator.Element result,
                                int limit)
{
    for (; m_current_ < limit; ++ m_current_) {
        final String candidate = m_name_.getExtendedOr10Name(m_current_);
        if (candidate == null || candidate.length() == 0) {
            // nameless codepoint, try the next one
            continue;
        }
        result.integer = m_current_;
        result.value = candidate;
        return false;
    }
    return true;
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/Attic/UCharacterNameReader.java,v $
* $Date: 2002/02/28 23:42:04 $
* $Revision: 1.4 $
* $Date: 2002/03/08 02:04:00 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -92,7 +92,19 @@ final class UCharacterNameReader
size = m_algnamesindex_ - m_groupstringindex_;
byte groupstring[] = new byte[size];
System.out.println("size " + size);
m_dataInputStream_.readFully(groupstring);
for (int i = 0; i < size; i ++) {
if (groupstring[i] == 0x14 &&
groupstring[i + 1] == 0x12 &&
groupstring[i + 2] == 0x3e &&
groupstring[i + 3] == 0x01 &&
groupstring[i + 4] == 0x39 &&
groupstring[i + 5] == 0x4 &&
groupstring[i + 6] == 0x1e)
System.out.println("found at " + i);
}
data.setGroup(group, groupstring);
count = m_dataInputStream_.readInt();