ICU-1707

New character name iteration X-SVN-Rev: 7913
2025-04-21 12:40:02 +00:00 · 2002-03-08 02:04:00 +00:00 · 2002-03-08 02:04:00 +00:00 · 51df46827d
commit 51df46827d
parent 2868b2a4d6
5 changed files with 937 additions and 283 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterTest.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterTest.java,v $ 
-* $Date: 2002/03/02 02:04:07 $ 
-* $Revision: 1.30 $
+* $Date: 2002/03/08 02:03:16 $ 
+* $Revision: 1.31 $
 *
 *******************************************************************************
 */
@ -24,6 +24,7 @@ import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.lang.UCharacterDirection;
 import com.ibm.icu.util.RangeValueIterator;
+import com.ibm.icu.util.ValueIterator;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.text.BreakIterator;
@ -54,6 +55,21 @@ public final class UCharacterTest extends TestFmwk
  
  // public methods ================================================
  
+  /**
+   * Command-line entry point for running this test class on its own.
+   * The arguments are handed to run(arg) so that they are honoured instead 
+   * of being ignored in favour of one hard-wired test method.
+   * @param arg command-line arguments passed on to run(arg)
+   */
+  public static void main(String[] arg)
+  {
+    try
+    {
+      UCharacterTest test = new UCharacterTest();
+      test.run(arg);
+    }
+    catch (Exception e)
+    {
+      e.printStackTrace();
+    }
+  }
+  
  /**
  * Testing the uppercase and lowercase function of UCharacter
  */
@ -635,8 +651,7 @@ public final class UCharacterTest extends TestFmwk
        errln(
          "FAIL: 'LATin smALl letTER A' should result in character U+0061"); 
    } 
-
-    
+	    
    // extra testing different from icu
    for (int i = UCharacter.MIN_VALUE; i < UCharacter.MAX_VALUE; i ++)
    {
@ -650,6 +665,123 @@ public final class UCharacterTest extends TestFmwk
    }
  }
  
+  /**
+   * Testing name iteration: walks the modern, extended and Unicode 1.0 name
+   * iterators and compares every element against the matching UCharacter 
+   * name getter.
+   */
+  public void TestNameIteration()
+  {
+    ValueIterator iterator = UCharacter.getNameIterator();
+    ValueIterator.Element element = new ValueIterator.Element();
+    ValueIterator.Element old     = new ValueIterator.Element();
+    // testing subrange
+    iterator.setRange(0xF, 0x45);
+    while (iterator.next(element)) {
+        if (element.integer <= old.integer) {
+            errln("FAIL next returned a less codepoint \\u" + 
+                  Integer.toHexString(element.integer) + " than \\u" + 
+                  Integer.toHexString(old.integer));
+            break;
+        }
+        // compare null-safely so that a missing name is reported as a 
+        // failure instead of aborting the test with a NullPointerException
+        String expected = UCharacter.getName(element.integer);
+        if (expected == null || !expected.equals(element.value)) {
+            errln("FAIL next codepoint \\u" + 
+                  Integer.toHexString(element.integer) + 
+                  " does not have the expected name " + expected + 
+                  " instead have the name " + (String)element.value);
+            break;
+        }
+        old.integer = element.integer; 
+    }
+    
+    // after a reset the test expects the subrange to restart at U+0020;
+    // a false return from next counts as a failure as well
+    iterator.reset();
+    if (!iterator.next(element) || element.integer != 0x20) {
+        errln("FAIL reset in iterator");
+    }
+ 
+    iterator.setRange(0, 0x110000);
+    old.integer = 0; 
+    while (iterator.next(element)) {
+        if (element.integer != 0 && element.integer <= old.integer) {
+            errln("FAIL next returned a less codepoint \\u" + 
+                  Integer.toHexString(element.integer) + " than \\u" + 
+                  Integer.toHexString(old.integer));
+            break;
+        }
+        String expected = UCharacter.getName(element.integer);
+        if (expected == null || !expected.equals(element.value)) {
+            errln("FAIL next codepoint \\u" + 
+                  Integer.toHexString(element.integer) + 
+                  " does not have the expected name " + expected + 
+                  " instead have the name " + (String)element.value);
+            break;
+        }
+        // every codepoint the iterator skipped must be nameless
+        for (int i = old.integer + 1; i < element.integer; i ++) {
+            String skipped = UCharacter.getName(i);
+            if (skipped != null) {
+                errln("FAIL between codepoints are not null \\u" + 
+                      Integer.toHexString(old.integer) + " and " + 
+                      Integer.toHexString(element.integer) + " has " + 
+                      Integer.toHexString(i) + " with a name " + 
+                      skipped);
+                break;
+            }
+        }
+        old.integer = element.integer; 
+    }
+    
+    // the extended iterator is expected to return consecutive codepoints,
+    // so old.integer is used as a running counter here
+    iterator = UCharacter.getExtendedNameIterator();
+    old.integer = 0;
+    while (iterator.next(element)) {
+        if (element.integer != 0 && element.integer != old.integer) {
+            errln("FAIL next returned a codepoint \\u" + 
+                  Integer.toHexString(element.integer) + 
+                  " different from \\u" + 
+                  Integer.toHexString(old.integer));
+            break;
+        }
+        String expected = UCharacter.getExtendedName(element.integer);
+        if (expected == null || !expected.equals(element.value)) {
+            errln("FAIL next codepoint \\u" + 
+                  Integer.toHexString(element.integer) + " name should be "
+                  + expected + 
+                  " instead of " + (String)element.value);
+            break;
+        }
+        old.integer++; 
+    }
+
+    iterator = UCharacter.getName1_0Iterator();
+    old.integer = 0;
+    while (iterator.next(element)) {
+        if (element.integer != 0 && element.integer <= old.integer) {
+            errln("FAIL next returned a less codepoint \\u" + 
+                  Integer.toHexString(element.integer) + " than \\u" + 
+                  Integer.toHexString(old.integer));
+            break;
+        }
+        String expected = UCharacter.getName1_0(element.integer);
+        if (element.value == null || !element.value.equals(expected)) {
+            errln("FAIL next codepoint \\u" + 
+                  Integer.toHexString(element.integer) + 
+                  " does not have the expected 1.0 name " + expected + 
+                  " instead have the name " + (String)element.value);
+            break;
+        }
+        // every codepoint the iterator skipped must have no 1.0 name
+        for (int i = old.integer + 1; i < element.integer; i ++) {
+            String skipped = UCharacter.getName1_0(i);
+            if (skipped != null) {
+                errln("FAIL between codepoints are not null \\u" + 
+                      Integer.toHexString(old.integer) + " and " + 
+                      Integer.toHexString(element.integer) + " has " + 
+                      Integer.toHexString(i) + " with a name " + 
+                      skipped);
+                break;
+            }
+        }
+        old.integer = element.integer; 
+    }
+
+    /* ### TODO: test error cases and other interesting things */
+  }
+  
  /**
  * Testing the for illegal characters
  */
@ -1069,19 +1201,5 @@ public final class UCharacterTest extends TestFmwk
    }
    return result;
  }
- 
-  public static void main(String[] arg)
-  {
-    try
-    {
-      UCharacterTest test = new UCharacterTest();
-      test.TestCaseTitle();
-      //test.run(arg);
-    }
-    catch (Exception e)
-    {
-      e.printStackTrace();
-    }
-  }
 }

--- a/icu4j/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacter.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $ 
-* $Date: 2002/03/02 02:04:09 $ 
-* $Revision: 1.27 $
+* $Date: 2002/03/08 02:04:00 $ 
+* $Revision: 1.28 $
 *
 *******************************************************************************
 */
@ -18,6 +18,7 @@ import com.ibm.icu.impl.UnicodeProperty;
 import com.ibm.icu.impl.UCharacterProperty;
 import com.ibm.icu.impl.Utility; 
 import com.ibm.icu.util.RangeValueIterator;
+import com.ibm.icu.util.ValueIterator;
 import com.ibm.icu.text.BreakIterator;

 /**
@ -879,7 +880,7 @@ public final class UCharacter
    */
    public static String getUnicodeVersion()
    {
-        return PROPERTY_.m_unicodeVersion_;
+        return PROPERTY_.m_unicodeVersion_.toString();
    }
      
    /**
@ -1067,6 +1068,7 @@ public final class UCharacter
    * @param breakiter break iterator to determine the positions in which
    *        the character should be title cased.
    * @return lowercase version of the argument string
+    * @draft 2.1
    */
    public static String toTitleCase(String str, BreakIterator breakiter)
    {
@ -1117,6 +1119,7 @@ public final class UCharacter
    * @param breakiter break iterator to determine the positions in which
    *        the character should be title cased.
    * @return lowercase version of the argument string
+    * @draft 2.1
    */
    public static String toTitleCase(Locale locale, String str, 
                                     BreakIterator breakiter)
@ -1340,13 +1343,14 @@ public final class UCharacter
    * Example of use:<br>
    * <pre>
    * RangeValueIterator iterator = UCharacter.getTypeIterator();
-    * while (iterator.next()) {
+    * RangeValueIterator.Element element = new RangeValueIterator.Element();
+    * while (iterator.next(element)) {
    *     System.out.println("Codepoint \\u" + 
-    *                        Integer.toHexString(iterator.getStart()) + 
+    *                        Integer.toHexString(element.start) + 
    *                        " to codepoint \\u" +
-    *                        Integer.toHexString(iterator.getLimit() - 1) + 
+    *                        Integer.toHexString(element.limit - 1) + 
    *                        " has the character type " + 
-    *                        iterator.getValue());
+    *                        element.value);
    * }
    * </pre>
    * @return an iterator 
@ -1356,6 +1360,98 @@ public final class UCharacter
    {
        return new UCharacterTypeIterator();
    }
+
+	/**
+    * <p>Gets an iterator for character names, iterating over codepoints.</p>
+    * <p>This API only gets the iterator for the modern, most up-to-date 
+    * Unicode names. For older 1.0 Unicode names use getName1_0Iterator() or
+    * for extended names use getExtendedNameIterator().</p>
+    * Example of use:<br>
+    * <pre>
+    * ValueIterator iterator = UCharacter.getNameIterator();
+    * ValueIterator.Element element = new ValueIterator.Element();
+    * while (iterator.next(element)) {
+    *     System.out.println("Codepoint \\u" + 
+    *                        Integer.toHexString(element.integer) +
+    *                        " has the name " + (String)element.value);
+    * }
+    * </pre>
+    * @return an iterator whose elements carry the codepoint in 
+    *         element.integer and its name in element.value
+    * @draft 2.1
+    */
+    public static ValueIterator getNameIterator()
+    {
+        return new UCharacterNameIterator(NAME_,
+                                   UCharacterNameChoice.U_UNICODE_CHAR_NAME);
+    }
+    
+    /**
+    * <p>Gets an iterator for character names, iterating over codepoints.</p>
+    * <p>This API only gets the iterator for the older 1.0 Unicode names. 
+    * For modern, most up-to-date Unicode names use getNameIterator() or
+    * for extended names use getExtendedNameIterator().</p>
+    * Example of use:<br>
+    * <pre>
+    * ValueIterator iterator = UCharacter.getName1_0Iterator();
+    * ValueIterator.Element element = new ValueIterator.Element();
+    * while (iterator.next(element)) {
+    *     System.out.println("Codepoint \\u" + 
+    *                        Integer.toHexString(element.integer) +
+    *                        " has the name " + (String)element.value);
+    * }
+    * </pre>
+    * @return an iterator whose elements carry the codepoint in 
+    *         element.integer and its 1.0 name in element.value
+    * @draft 2.1
+    */
+    public static ValueIterator getName1_0Iterator()
+    {
+        return new UCharacterNameIterator(NAME_,
+                                 UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
+    }
+    
+    /**
+    * <p>Gets an iterator for character names, iterating over codepoints.</p>
+    * <p>This API only gets the iterator for the extended names. 
+    * For modern, most up-to-date Unicode names use getNameIterator() or
+    * for older 1.0 Unicode names use getName1_0Iterator().</p>
+    * Example of use:<br>
+    * <pre>
+    * ValueIterator iterator = UCharacter.getExtendedNameIterator();
+    * ValueIterator.Element element = new ValueIterator.Element();
+    * while (iterator.next(element)) {
+    *     System.out.println("Codepoint \\u" + 
+    *                        Integer.toHexString(element.integer) +
+    *                        " has the name " + (String)element.value);
+    * }
+    * </pre>
+    * @return an iterator whose elements carry the codepoint in 
+    *         element.integer and its extended name in element.value
+    * @draft 2.1
+    */
+    public static ValueIterator getExtendedNameIterator()
+    {
+        return new UCharacterNameIterator(NAME_,
+                                 UCharacterNameChoice.U_EXTENDED_CHAR_NAME);
+    }
+    
+    // protected data members --------------------------------------------
+    
+    /**
+    * Database storing the sets of character names. Assigned once by the 
+    * static initialiser that follows and handed to the iterators created 
+    * by getNameIterator(), getName1_0Iterator() and 
+    * getExtendedNameIterator().
+    */
+    protected static final UCharacterName NAME_;
+      
+    // initialise the name database; a failure resurfaces as a RuntimeException carrying only the message (NOTE(review): the original cause is dropped)
+    static
+    {
+        try
+        {
+            NAME_ = new UCharacterName();
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e.getMessage());
+        }
+    }
    
    // protected methods -------------------------------------------------
      
@ -1382,24 +1478,6 @@ public final class UCharacter
    private static final UCharacterProperty PROPERTY_ = 
                                                    UnicodeProperty.PROPERTY;
   
-    /**
-    * Database storing the sets of character name
-    */
-    private static final UCharacterName NAME_;
-      
-    // block to initialise name database and unicode 1.0 data indicator
-    static
-    {
-        try
-        {
-            NAME_ = new UCharacterName();
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e.getMessage());
-        }
-    }
-
    /**
    * To get the last character out from a data type
    */
--- a/icu4j/src/com/ibm/icu/lang/UCharacterName.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacterName.java
@ -6,8 +6,8 @@
 *
 * $Source: 
 *     /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $ 
-* $Date: 2002/03/02 01:50:51 $ 
-* $Revision: 1.13 $
+* $Date: 2002/03/08 02:04:00 $ 
+* $Revision: 1.14 $
 *
 *******************************************************************************
 */
@ -273,8 +273,7 @@ final class UCharacterName
                        indexes[0] = offset;

                        // joining up the factorized strings 
-                        if (compareFactorString(indexes, 
-                                                name.substring(prefixlen))) {
+                        if (compareFactorString(indexes, name, prefixlen)) {
                            return ch;
                        }
                    }
@ -337,16 +336,18 @@ final class UCharacterName
        * the argument string
        * @param index array with each index corresponding to each factor block
        * @param str string to compare with
+        * @param offset of str to start comparison
        * @return true if string matches
        */
-        private boolean compareFactorString(int index[], String str)
+        private boolean compareFactorString(int index[], String str, 
+                                            int offset)
        {
            int size = m_factor_.length;
            if (index == null || index.length != size)
                return false;
                
            int count = 0;
-            int strcount = 0;
+            int strcount = offset;
            int factor;
            size --;
            for (int i = 0; i <= size; i ++)
@ -372,6 +373,22 @@ final class UCharacterName
        }
    }
    
+    // protected data members --------------------------------------------
+    
+    /**
+     * Number of groups; group indexes run from 0 up to, but excluding, 
+     * this value
+     */
+    protected int m_groupcount_ = 0;
+    /**
+     * Size of each group record: the stride used when indexing 
+     * m_groupinfo_ (group i starts at i * m_groupsize_)
+     */
+    protected int m_groupsize_ = 0;
+    /**
+    * Number of lines per group.
+    * Must stay equal to 1 << GROUP_SHIFT_ (GROUP_SHIFT_ is 5); the shift is 
+    * written out literally here, presumably because GROUP_SHIFT_ is 
+    * declared further down in the class - TODO confirm.
+    */
+    protected static final int LINES_PER_GROUP_ = 1 << 5;
+    
    // protected constructor ---------------------------------------------
    
    /**
@ -541,113 +558,6 @@ final class UCharacterName
        return false; 
    }
    
-    // private data members ----------------------------------------------
-    
-    /**
-    * Data used in unames.dat
-    */
-    private char m_tokentable_[];
-    private byte m_tokenstring_[];
-    private char m_groupinfo_[];
-    private byte m_groupstring_[];
-    private AlgorithmName m_algorithm_[];
-      
-    /**
-    * Number of group sets
-    */
-    private int m_groupcount_ = 0;
-    private int m_groupsize_ = 0;
-      
-    /**
-    * Default name of the name datafile
-    */
-    private static final String NAME_FILE_NAME_ = 
-                                           "/com/ibm/icu/impl/data/unames.dat";
-      
-    /**
-    * Default buffer size of datafile
-    */
-    private static final int NAME_BUFFER_SIZE_ = 100000;
-      
-    /**
-    * Shift count to retrieve group information
-    */
-    private static final int GROUP_SHIFT_ = 5;
-      
-    /**
-    * Number of lines per group
-    */
-    private static final int LINES_PER_GROUP_ = 1 << GROUP_SHIFT_;
-      
-    /**
-    * Mask to retrieve the offset for a particular character within a group
-    */
-    private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
-      
-    /**
-    * Position of offsethigh in group information array
-    */
-    private static final int OFFSET_HIGH_OFFSET_ = 1;
-      
-    /**
-    * Position of offsetlow in group information array
-    */
-    private static final int OFFSET_LOW_OFFSET_ = 2;
-    /**
-    * Double nibble indicator, any nibble > this number has to be combined
-    * with its following nibble
-    */
-    private static final int SINGLE_NIBBLE_MAX_ = 11;
-      
-    // private methods ---------------------------------------------------
-      
-    /**
-    * Gets the algorithmic name for the argument character
-    * @param ch character to determine name for
-    * @param choice name choice
-    * @return the algorithmic name or null if not found
-    */
-    private String getAlgName(int ch, int choice) 
-    {
-    	// Do not write algorithmic Unicode 1.0 names because Unihan names are 
-        // the same as the modern ones, extension A was only introduced with 
-        // Unicode 3.0, and the Hangul syllable block was moved and changed 
-        // around Unicode 1.1.5.
-        if (choice != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
-       	 	// index in terms integer index
-        	StringBuffer s = new StringBuffer();
-        
-        	for (int index = m_algorithm_.length - 1; index >= 0; index --) {
-         	   if (m_algorithm_[index].contains(ch)) {
-          	      if (index >= 0) {
-           	 	      m_algorithm_[index].appendName(ch, s);
-            	      return s.toString();
-             	   }
-         	   }
-            }
-        }
-        return null;
-    }
-      
-    /**
-    * Getting the character with the tokenized argument name
-    * @param name of the character
-    * @return character with the tokenized argument name or -1 if character
-    *         is not found
-    */
-    private int getGroupChar(String name, int choice) 
-    {
-        int result = 0;
-        
-        for (int i = 0; i < m_groupcount_; i ++) {
-            result = getGroupChar(i, name, choice);
-            if (result != -1) {
-                return result;
-            }
-        }
-        return -1;
-    }
-      
    /**
    * Reads a block of compressed lengths of 32 strings and expands them into 
    * offsets and lengths for each string. Lengths are stored with a 
@ -664,7 +574,7 @@ final class UCharacterName
    * @return next index of the data string immediately after the lengths 
    *         in terms of byte address
    */
-    private int getGroupLengths(int index, char offsets[], char lengths[]) 
+    protected int getGroupLengths(int index, char offsets[], char lengths[]) 
    {
        char length = 0xffff;
        byte b = 0,
@ -687,22 +597,22 @@ final class UCharacterName
                // getting nibble
                n = (byte)((b >> shift) & 0x0F);   
                if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) {
-                length = (char)((n - 12) << 4);
+                	length = (char)((n - 12) << 4);
                }
                else {
-                if (length != 0xffff) {
-                    lengths[i] = (char)((length | n) + 12);
-                }
-                else {
-                    lengths[i] = (char)n;
-                }
+                	if (length != 0xffff) {
+                 	   lengths[i] = (char)((length | n) + 12);
+                	}
+                	else {
+                 	   lengths[i] = (char)n;
+                	}
                    
-                if (i < LINES_PER_GROUP_) {
-                    offsets[i + 1] = (char)(offsets[i] + lengths[i]);
-                }
+                	if (i < LINES_PER_GROUP_) {
+                 	   offsets[i + 1] = (char)(offsets[i] + lengths[i]);
+                	}
                    
-                length = 0xffff;
-                i ++;
+                	length = 0xffff;
+                	i ++;
                }
                      
                shift -= 4;
@ -710,7 +620,7 @@ final class UCharacterName
        }
        return stringoffset;
    }
-      
+    
    /**
    * Gets the name of the argument group index
    * @param index of the group name string in byte count
@ -718,13 +628,13 @@ final class UCharacterName
    * @param choice of Unicode 1.0 name or the most current name
    * @return name of the group 
    */
-    private String getGroupName(int index, int length, int choice) 
+    protected String getGroupName(int index, int length, int choice) 
    {
        if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
-            int oldindex = index;
-            index += UCharacterUtil.skipByteSubString(m_groupstring_, index, 
-                                                      length, (byte)';');
-            length -= (index - oldindex);
+        	int oldindex = index;
+         	index += UCharacterUtil.skipByteSubString(m_groupstring_, 
+         		                               index, length, (byte)';');   
+         	length -= (index - oldindex);
        }
        
        StringBuffer s = new StringBuffer();
@ -736,7 +646,7 @@ final class UCharacterName
              
            if (b >= m_tokentable_.length) {
                if (b == ';') {
-                break;
+                	break;
                }
                s.append(b); // implicit letter
            }
@ -750,6 +660,13 @@ final class UCharacterName
                }
                if (token == 0xFFFF) {
                    if (b == ';') {
+                    	// skip the semicolon if we are seeking extended 
+                    	// names and there was no 2.0 name but there
+                        // is a 1.0 name.
+                    	if (s.length() == 0 && choice == 
+                    	       UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
+                        	continue;
+                    	}
                        break;
                    }
                    s.append((char)(b & 0x00ff)); // explicit letter
@ -766,6 +683,300 @@ final class UCharacterName
        }
        return s.toString();
    }
+    
+    /**
+    * Retrieves the extended name of a codepoint: its modern Unicode name 
+    * when it has one, otherwise whatever getExtendedOr10Name(ch) supplies.
+    * @param ch codepoint
+    * @return modern name of ch, or the result of getExtendedOr10Name(ch) 
+    *         when the modern name lookup returns null
+    */
+    protected String getExtendedName(int ch) 
+    {    
+        String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);    
+        if (result == null) {        
+            // getExtendedOr10Name itself tries the 1.0 name of control 
+            // characters before building a name, so that lookup is not 
+            // repeated here
+            result = getExtendedOr10Name(ch);
+        }    
+        return result;
+    }
+    
+    /**
+     * Gets the group index for the codepoint, or the group before it.
+     * @param codepoint
+     * @return group index containing codepoint or the group before it.
+     */
+    protected int getGroup(int codepoint)
+    {
+    	int endGroup = m_groupcount_;
+    	int msb      = getCodepointMSB(codepoint);
+        int result   = 0;    
+        // binary search for the group of names that contains the one for 
+        // code
+        // find the group that contains codepoint, or the highest before it
+        while (result < endGroup - 1) {
+            int gindex = (result + endGroup) >> 1;
+            if (msb < getGroupMSB(gindex)) {
+               	endGroup = gindex;
+            }
+            else {
+               	result = gindex;
+            }
+        }
+        return result;
+    }
+    
+    /**
+     * Fallback naming used when the most current Unicode name lookup 
+     * fails. Control characters get their 1.0 name if there is one; 
+     * anything else gets a name of the form &lt;type-XXXX&gt;, built from 
+     * the category name and the uppercase hex codepoint padded to at least 
+     * four digits.
+     * @param ch codepoint
+     * @return name of codepoint extended or 1.0
+     */
+    protected String getExtendedOr10Name(int ch)
+    {
+        if (getType(ch) == UCharacterCategory.CONTROL) {
+            String oldName = getName(ch, 
+                             UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
+            if (oldName != null) {
+                return oldName;
+            }
+        }
+
+        int type = getType(ch);
+        String typeName;
+        if (type < UCharacterCategory.TYPE_NAMES_.length) {
+            typeName = UCharacterCategory.TYPE_NAMES_[type];
+        }
+        else {
+            // the table of type names is not up to date
+            typeName = UCharacterCategory.UNKNOWN_TYPE_NAME_;
+        }
+
+        StringBuffer buffer = new StringBuffer(typeName).insert(0, '<');
+        buffer.append('-');
+        String hex = Integer.toHexString(ch).toUpperCase();
+        // left-pad the hex digits with zeros up to four characters
+        for (int width = hex.length(); width < 4; width ++) {
+            buffer.append('0');
+        }
+        buffer.append(hex);
+        buffer.append('>');
+        return buffer.toString();
+    }
+    
+    // these are all UCharacterNameIterator use methods -------------------
+    
+    /**
+     * Looks up the MSB recorded for a group, i.e. the first m_groupinfo_ 
+     * entry of that group.
+     * @param gindex group index
+     * @return the MSB of the group, or -1 when gindex is at or beyond the 
+     *         group count
+     */
+    protected int getGroupMSB(int gindex)
+    {
+        if (gindex < m_groupcount_) {
+            return m_groupinfo_[gindex * m_groupsize_];
+        }
+        return -1;
+    }
+    
+    /**
+     * Gets the MSB of the codepoint: the codepoint with its low 
+     * GROUP_SHIFT_ bits shifted out. This is the value that getGroup() 
+     * compares against the group MSBs.
+     * @param codepoint codepoint to convert
+     * @return codepoint >> GROUP_SHIFT_
+     */
+    protected int getCodepointMSB(int codepoint)
+    {
+    	return codepoint >> GROUP_SHIFT_;
+    }
+    
+    /**
+     * Gets the maximum codepoint + 1 of the group
+     * @param msb MSB of the group, i.e. a codepoint shifted right by 
+     *        GROUP_SHIFT_ (not a literal byte)
+     * @return limit codepoint of the group: the group minimum plus 
+     *         LINES_PER_GROUP_
+     */
+    protected int getGroupLimit(int msb)
+    {
+    	return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_;
+    }
+    
+    /**
+     * Gets the minimum codepoint of the group
+     * @param msb MSB of the group, i.e. a codepoint shifted right by 
+     *        GROUP_SHIFT_ (not a literal byte)
+     * @return minimum codepoint of the group, msb << GROUP_SHIFT_
+     */
+    protected int getGroupMin(int msb)
+    {
+    	return msb << GROUP_SHIFT_;
+    }
+    
+    /**
+     * Gets the offset of a codepoint within its group, i.e. the bits of 
+     * the codepoint selected by GROUP_MASK_
+     * @param codepoint codepoint to examine
+     * @return offset of codepoint inside its group
+     */
+    protected int getGroupOffset(int codepoint)
+    {
+    	return codepoint & GROUP_MASK_;
+    }
+
+	/**
+     * Gets the minimum codepoint of a group by clearing the GROUP_MASK_ 
+     * bits of the argument
+     * @param codepoint any codepoint
+     * @return minimum codepoint in the group which codepoint belongs to
+     */
+    protected int getGroupMinFromCodepoint(int codepoint)
+    {
+    	return codepoint & ~GROUP_MASK_;
+    }
+    
+    /**
+     * Gets the number of algorithmic name ranges, which is the exclusive 
+     * upper bound for the index taken by getAlgorithmStart, 
+     * getAlgorithmEnd and getAlgorithmName
+     * @return length of the algorithm range array
+     */
+    protected int getAlgorithmLength()
+    {
+    	return m_algorithm_.length;
+    }
+        
+    /**
+     * Gets the start of the range of an algorithmic name entry
+     * @param index algorithm index; not range-checked here, so it has to be 
+     *        within 0 and getAlgorithmLength() - 1
+     * @return algorithm range start
+     */
+    protected int getAlgorithmStart(int index)
+    {
+      	return m_algorithm_[index].m_rangestart_;
+    }
+        
+    /**
+     * Gets the end of the range of an algorithmic name entry
+     * @param index algorithm index; not range-checked here, so it has to be 
+     *        within 0 and getAlgorithmLength() - 1
+     * @return algorithm range end (NOTE(review): whether this end is 
+     *         inclusive is not visible here - confirm against AlgorithmName)
+     */
+    protected int getAlgorithmEnd(int index)
+    {
+      	return m_algorithm_[index].m_rangeend_;
+    }
+    
+    /**
+     * Builds the algorithmic name of a codepoint using one particular 
+     * algorithmic range.
+     * @param index algorithmic range index
+     * @param codepoint codepoint to name
+     * @return algorithmic name of codepoint
+     */
+    protected String getAlgorithmName(int index, int codepoint) 
+    {
+        StringBuffer name = new StringBuffer();
+        AlgorithmName range = m_algorithm_[index];
+        range.appendName(codepoint, name);
+        return name.toString();
+    }
+    
+        
+    // private data members ----------------------------------------------
+    
+    /**
+    * Data used in unames.dat
+    */
+    private char m_tokentable_[];
+    private byte m_tokenstring_[];
+    private char m_groupinfo_[];
+    private byte m_groupstring_[];
+    private AlgorithmName m_algorithm_[];
+      
+    /**
+    * Scratch buffers receiving the per-line offsets and lengths of a group 
+    * from getGroupLengths. They are instance-wide and reused on every 
+    * lookup; the two methods that fill them, getGroupChar(String, int) and 
+    * getGroupName(int, int), are declared synchronized.
+    */
+    private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1];
+    private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1];
+      	 
+    /**
+    * Default name of the name datafile
+    */
+    private static final String NAME_FILE_NAME_ = 
+                                           "/com/ibm/icu/impl/data/unames.dat";
+    /**
+    * Shift count to retrieve group information
+    */
+    private static final int GROUP_SHIFT_ = 5;
+    /**
+    * Mask to retrieve the offset for a particular character within a group
+    */
+    private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
+    /**
+    * Default buffer size of datafile
+    */
+    private static final int NAME_BUFFER_SIZE_ = 100000;
+      
+    /**
+    * Position of offsethigh in group information array
+    */
+    private static final int OFFSET_HIGH_OFFSET_ = 1;
+      
+    /**
+    * Position of offsetlow in group information array
+    */
+    private static final int OFFSET_LOW_OFFSET_ = 2;
+    /**
+    * Double nibble indicator, any nibble > this number has to be combined
+    * with its following nibble
+    */
+    private static final int SINGLE_NIBBLE_MAX_ = 11;
+     
+      
+    // private methods ---------------------------------------------------
+      
+    /**
+    * Gets the algorithmic name for the argument character by scanning the 
+    * algorithmic ranges from the last one to the first
+    * @param ch character to determine name for
+    * @param choice name choice
+    * @return the algorithmic name or null if not found
+    */
+    private String getAlgName(int ch, int choice) 
+    {
+        // Algorithmic Unicode 1.0 names are never written because Unihan 
+        // names are the same as the modern ones, extension A was only 
+        // introduced with Unicode 3.0, and the Hangul syllable block was 
+        // moved and changed around Unicode 1.1.5.
+        if (choice == UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
+            return null;
+        }
+
+        int index = m_algorithm_.length;
+        while (-- index >= 0) {
+            AlgorithmName range = m_algorithm_[index];
+            if (range.contains(ch)) {
+                StringBuffer s = new StringBuffer();
+                range.appendName(ch, s);
+                return s.toString();
+            }
+        }
+        return null;
+    }
+      
+    /**
+    * Getting the character with the tokenized argument name, trying each 
+    * group in turn
+    * @param name of the character
+    * @param choice name choice passed on to the per-group search
+    * @return character with the tokenized argument name or -1 if character
+    *         is not found
+    */
+    private synchronized int getGroupChar(String name, int choice) 
+    {
+        int group = 0;
+        while (group < m_groupcount_) {
+            // expand the lengths of this group into the shared buffers
+            int stringstart = getGroupLengths(group, m_groupoffsets_, 
+                                              m_grouplengths_);
+
+            int offset = getGroupChar(stringstart, m_grouplengths_, name, 
+                                      choice);
+            if (offset != -1) {
+                // the group MSB gives the high bits, the match the low bits
+                int msb = m_groupinfo_[group * m_groupsize_];
+                return (msb << GROUP_SHIFT_) | offset;
+            }
+            group ++;
+        }
+        return -1;
+    }
      
    /**
    * Compares and retrieve character if name is found within the argument 
@ -884,62 +1095,45 @@ final class UCharacterName
    * @param ch character to get the group name 
    * @param choice name choice selector to choose a unicode 1.0 or newer name
    */
-    private String getGroupName(int ch, int choice) 
+    private synchronized String getGroupName(int ch, int choice) 
    {            
        // gets the msb
-        int msb = ch >> GROUP_SHIFT_,
-            end = m_groupcount_,
-            start,
-            gindex = 0;
-        
-        // binary search for the group of names that contains the one for 
-        // code
-        for (start = 0; start < end - 1;) {
-            gindex = (start + end) >> 1;
-            if (msb < m_groupinfo_[gindex * m_groupsize_]) {
-                end = gindex;
-            }
-            else {
-                start = gindex;
-            }
-        }
+        int msb   = getCodepointMSB(ch);
+        int group = getGroup(ch);

        // return this if it is an exact match
-        if (msb == m_groupinfo_[start * m_groupsize_]) {
-            char offsets[] = new char[LINES_PER_GROUP_ + 1];
-            char lengths[] = new char[LINES_PER_GROUP_ + 1];
-                        
-            int index = getGroupLengths(start, offsets, lengths);
+        if (msb == m_groupinfo_[group * m_groupsize_]) {
+            int index = getGroupLengths(group, m_groupoffsets_, 
+                                        m_grouplengths_);
            int offset = ch & GROUP_MASK_;
-            return getGroupName(index + offsets[offset], lengths[offset], 
-                                choice);
+            return getGroupName(index + m_groupoffsets_[offset], 
+                                m_grouplengths_[offset], choice);
        }
        
        return null;
    }
-      
+    
    /**
-    * Getting the character with the tokenized argument name
-    * @param index of the group to check
-    * @param name of the character
-    * @param choice of Unicode version used
-    * @return character with the tokenized argument name or -1 if character 
-    *         is not found
+    * Gets the character extended type
+    * @param ch character to be tested
+    * @return extended type it is associated with
    */
-    private int getGroupChar(int index, String name, int choice) 
+    private int getType(int ch)
    {
-        // populating the data set of grouptable
-        char offsets[] = new char[LINES_PER_GROUP_ + 1];
-        char lengths[] = new char[LINES_PER_GROUP_ + 1];
-        int startgpstrindex = getGroupLengths(index, offsets, lengths);
-          
-        // shift out to function
-        int result = getGroupChar(startgpstrindex, lengths, name, choice);
-        if (result != -1) {
-            return (m_groupinfo_[index * m_groupsize_] << GROUP_SHIFT_) | 
-                   result;
-        }
-        return -1;
+        if (UCharacter.isNonCharacter(ch)) {  
+            // not a character we return a invalid category count
+            return UCharacterCategory.NON_CHARACTER_;    
+        }    
+        int result = UCharacter.getType(ch);
+        if (result == UCharacterCategory.SURROGATE) {            
+            if (ch <= UnicodeProperty.LEAD_SURROGATE_MAX_VALUE) {
+                result = UCharacterCategory.LEAD_SURROGATE_;
+            }
+            else {
+                result = UCharacterCategory.TRAIL_SURROGATE_;
+            }    
+        }    
+        return result;
    }
    
    /**
@ -987,65 +1181,4 @@ final class UCharacterName
        }    
        return -2;
    }
-    
-    /**
-    * Gets the character extended type
-    * @param ch character to be tested
-    * @return extended type it is associated with
-    */
-    private int getType(int ch)
-    {
-        if (UCharacter.isNonCharacter(ch)) {  
-            // not a character we return a invalid category count
-            return UCharacterCategory.NON_CHARACTER_;    
-        }    
-        int result = UCharacter.getType(ch);
-        if (result == UCharacterCategory.SURROGATE) {            
-            if (ch <= UnicodeProperty.LEAD_SURROGATE_MAX_VALUE) {
-                result = UCharacterCategory.LEAD_SURROGATE_;
-            }
-            else {
-                result = UCharacterCategory.TRAIL_SURROGATE_;
-            }    
-        }    
-        return result;
-    }
-    
-    /**
-    * Retrieves the extended name
-    */
-    private String getExtendedName(int ch) 
-    {    
-        String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);    
-        if (result == null) {        
-            if (getType(ch) == UCharacterCategory.CONTROL) {            
-                result = getName(ch, 
-                                 UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);        
-            }        
-            if (result == null) {            
-                int type = getType(ch);    
-                // Return unknown if the table of names above is not up to 
-                // date.
-                if (type >= UCharacterCategory.TYPE_NAMES_.length) {       
-                    result = UCharacterCategory.UNKNOWN_TYPE_NAME_;    
-                } 
-                else {        
-                    result = UCharacterCategory.TYPE_NAMES_[type];    
-                }
-                StringBuffer tempResult = new StringBuffer(result);
-                tempResult.insert(0, '<');
-                tempResult.append('-');
-                String chStr = Integer.toHexString(ch).toUpperCase();
-                int zeros = 4 - chStr.length();
-                while (zeros > 0) {
-                    tempResult.append('0');
-                    zeros --;
-                }
-                tempResult.append(chStr);
-                tempResult.append('>');
-                result = tempResult.toString();
-            }
-        }    
-        return result;
-    }
 }
--- a/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java
@ -0,0 +1,313 @@
+/*
+******************************************************************************
+* Copyright (C) 1996-2002, International Business Machines Corporation and   *
+* others. All Rights Reserved.                                               *
+******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacterNameIterator.java,v $
+* $Date: 2002/03/08 02:04:00 $
+* $Revision: 1.1 $
+*
+******************************************************************************
+*/
+
+package com.ibm.icu.lang;
+
+import com.ibm.icu.util.ValueIterator;
+
+/**
+ * Class enabling iteration of the codepoints and their names.
+ * The result of each iteration contains a valid codepoint and the name 
+ * associated with it.
+ * See UCharacter.getNameIterator() for an example of use.
+ * @author synwee
+ * @since release 2.1, March 5 2002
+ */
+class UCharacterNameIterator implements ValueIterator
+{
+	// public methods ----------------------------------------------------
+	
+	/**
+    * <p>Gets the next result for this iteration and returns 
+    * true if we are not at the end of the iteration, false otherwise.</p>
+    * <p>If the return boolean is a false, the contents of element will not
+    * be updated.</p>
+    * @param element for storing the result codepoint and its name
+    * @return true if we are not at the end of the iteration, false otherwise.
+    * @see Element
+    * @draft 2.1
+    */
+    public boolean next(ValueIterator.Element element)
+    {
+    	if (m_current_ >= m_limit_) {
+    		return false;
+    	}
+    	
+    	if (m_choice_ != UCharacterNameChoice.U_UNICODE_10_CHAR_NAME) {
+    		int length = m_name_.getAlgorithmLength();
+    		if (m_algorithmIndex_ < length) {
+    			while (m_algorithmIndex_ < length) {
+    				// find the algorithm range that could contain m_current_ 
+    				if (m_algorithmIndex_ < 0 ||
+    				    m_name_.getAlgorithmEnd(m_algorithmIndex_) < 
+    				    m_current_) {
+    				 	m_algorithmIndex_ ++;   
+    				}
+    				else {
+    					break;
+    				}
+    			}
+    			
+    			if (m_algorithmIndex_ < length) {
+    				// interleave the data-driven ones with the algorithmic ones
+   					// iterate over all algorithmic ranges; assume that they are 
+   					// in ascending order
+   					int start = m_name_.getAlgorithmStart(m_algorithmIndex_);
+   					if (m_current_ < start) {
+   						// first enumerate the group names that lie before the
+   						// start of the algorithmic range (or before the limit)
+   						int end = start;
+        		    	if (m_limit_ <= start) {
+         		 	      	end = m_limit_;
+          		  		}
+            			if (!iterateGroup(element, end)) {
+            				m_current_ ++;
+              	 		 	return true;
+            			}	
+        			}
+        	
+        			if (m_current_ >= m_limit_) {
+        				// iterateGroup found no name and the current codepoint
+        				// may have reached the limit
+        				return false;
+        			}
+        			
+        			element.integer = m_current_;
+        			element.value   = m_name_.getAlgorithmName(m_algorithmIndex_, 
+        		 	                                               m_current_);
+        			// reset the group index if we are in the algorithmic names                       
+        			m_groupIndex_ = -1; 
+        			m_current_ ++;
+        			return true;
+    			}
+    		}
+    	}
+    	// enumerate the character names after the last algorithmic range 
+    	if (!iterateGroup(element, m_limit_)) {
+    		m_current_ ++;
+    		return true;
+    	}
+    	else if (m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
+    		if (!iterateExtended(element, m_limit_)) {
+    			m_current_ ++;
+    			return true;
+    		}
+    	}
+    	
+		return false;
+    }
+    
+    /**
+    * Resets the iteration to the range start and clears the cached indices.
+    * @draft 2.1
+    */
+    public void reset()
+    {
+    	m_current_        = m_start_;
+    	m_groupIndex_     = -1;
+    	m_algorithmIndex_ = -1;
+    }
+    
+    /**
+     * Sets the range for iteration and restarts the iteration at start
+     * @param start first codepoint to iterate
+     * @param limit one codepoint after the last codepoint to iterate
+     * @exception IllegalArgumentException thrown when start or limit exceed
+     *            the Unicode codepoint bounds or when start > limit.
+     */
+    public void setRange(int start, int limit)
+    {
+    	if (start > limit || start < UCharacter.MIN_VALUE || 
+    	    limit > UCharacter.MAX_VALUE + 1) {
+    	    throw new IllegalArgumentException(
+    	        "start or limit has to be valid Unicode codepoints and start <= limit");
+    	}
+    	m_start_   = start;
+    	m_limit_   = limit;
+    	reset(); // moves to start and drops the stale group/algorithm indices
+    }
+    
+	// protected constructor ---------------------------------------------
+    
+    /**
+    * Creates an iterator that covers the whole Unicode codepoint range
+    * @param name name data to iterate over
+    * @param choice name choice from the class 
+    *               com.ibm.icu.lang.UCharacterNameChoice
+    * @draft 2.1
+    */
+    protected UCharacterNameIterator(UCharacterName name, int choice)
+    {
+    	m_name_    = name;
+    	// choice is stored unchecked, UCharacter has no explicit choice
+     	m_choice_  = choice;
+    	m_start_   = UCharacter.MIN_VALUE;
+    	m_limit_   = UCharacter.MAX_VALUE + 1;
+    	m_current_ = m_start_;
+    }
+    
+    // private data members ---------------------------------------------
+  	
+  	/**
+  	 * Name data that is iterated over
+  	 */
+  	private UCharacterName m_name_;
+  	/**
+  	 * Name choice, see UCharacterNameChoice
+  	 */
+  	private int m_choice_;
+  	 /**
+     * First codepoint of the iteration range
+     */
+    private int m_start_;
+    /**
+     * Last codepoint of the iteration range + 1
+     */
+    private int m_limit_;
+    /**
+     * Next codepoint to be examined
+     */
+    private int m_current_;
+    /**
+     * Index of the current name group, -1 when it has to be looked up
+     */
+    private int m_groupIndex_ = -1;
+    /**
+     * Index of the current algorithmic range, -1 before the first lookup
+     */
+    private int m_algorithmIndex_ = -1;
+    /**
+    * Static scratch tables for group offsets and lengths, lock before use
+    */
+    private static char GROUP_OFFSETS_[] = 
+                                new char[UCharacterName.LINES_PER_GROUP_ + 1];
+    private static char GROUP_LENGTHS_[] = 
+                                new char[UCharacterName.LINES_PER_GROUP_ + 1];
+    
+    // private methods --------------------------------------------------
+    
+    /**
+     * Iterates the names of the current group, from the current codepoint up 
+     * to limit, and stops at the first codepoint that has a non-empty name.
+ 	 * @param result stores the result codepoint and name
+ 	 * @param limit last codepoint + 1 in range to search
+ 	 * @return false if a named codepoint was stored in result, true if limit
+ 	 *         was reached without a name; extended names are used as a 
+ 	 *         fallback when the choice is U_EXTENDED_CHAR_NAME
+     */
+    private boolean iterateSingleGroup(Element result, int limit) 
+    {
+    	synchronized(GROUP_OFFSETS_) {
+    	synchronized(GROUP_LENGTHS_) {
+ 			int index = m_name_.getGroupLengths(m_groupIndex_, GROUP_OFFSETS_, 
+    		                                    GROUP_LENGTHS_);
+        	while (m_current_ < limit) {
+        		int    offset = m_name_.getGroupOffset(m_current_);
+        		String name   = m_name_.getGroupName(
+        		                          index + GROUP_OFFSETS_[offset], 
+        	 	                          GROUP_LENGTHS_[offset], m_choice_);
+        		if ((name == null || name.length() == 0) && 
+          	 		m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
+           			name = m_name_.getExtendedName(m_current_);
+        		}
+        		if (name != null && name.length() > 0) {
+        			result.integer = m_current_;
+        			result.value   = name;
+          	  	    return false;
+            	}
+            	++ m_current_;
+        	}
+    	}
+    	}
+    	return true;
+	}
+	
+	/**
+     * Group name iteration, walks the groups from the current codepoint up to 
+     * limit and stops at the first codepoint that has a valid name.
+ 	 * @param result stores the result codepoint and name
+ 	 * @param limit last codepoint + 1 in range to search
+ 	 * @return false if a codepoint with a name is found and we can
+ 	 *         bail from further iteration, true to continue on with the 
+ 	 *         iteration
+     */
+    private boolean iterateGroup(Element result, int limit)
+    {
+    	if (m_groupIndex_ < 0) {
+    		m_groupIndex_ = m_name_.getGroup(m_current_);
+    	}
+    	
+    	while (m_groupIndex_ < m_name_.m_groupcount_ &&
+    	       m_current_ < limit) { 
+    		// iterate till the last group or the last codepoint
+    		int startMSB = m_name_.getCodepointMSB(m_current_);
+    		int gMSB     = m_name_.getGroupMSB(m_groupIndex_); // can be -1
+    		if (startMSB == gMSB) {
+    			if (startMSB == m_name_.getCodepointMSB(limit - 1)) {
+    				// if start and limit - 1 are in the same group, then enumerate 
+    				// only in that one
+    				return iterateSingleGroup(result, limit);
+    			}
+    			// enumerate characters in the partial start group
+           		// up to the limit of this group only
+               	if (!iterateSingleGroup(result, 
+                           	            m_name_.getGroupLimit(gMSB))) {
+                	return false;
+                }
+               	++ m_groupIndex_; // continue with the next group
+    		}	
+       		else if (startMSB > gMSB) {
+           			// make sure that we start enumerating with the first group 
+           			// after start 
+           			m_groupIndex_ ++;
+       		}
+       		else { // current codepoint lies before the group at m_groupIndex_
+       			int gMIN = m_name_.getGroupMin(gMSB);
+       			if (gMIN > limit) {
+               		gMIN = limit;
+            	}
+       			if (m_choice_ == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
+					if (!iterateExtended(result, gMIN)) {
+		 				return false;
+					}
+	  			}
+	  			m_current_ = gMIN;
+        	}
+    	}
+        
+		return true;
+	}
+	
+	/**
+ 	 * Iterates extended (or Unicode 1.0) names from the current codepoint.
+ 	 * @param result stores the result codepoint and name
+ 	 * @param limit last codepoint + 1 in range to search
+ 	 * @return false if a codepoint with a name is found and we can
+ 	 *         bail from further iteration, true to continue on with the 
+ 	 *         iteration (this will always be false for valid codepoints)
+ 	 */ 
+	private boolean iterateExtended(UCharacterNameIterator.Element result,
+	                                int limit)
+	{
+        while (m_current_ < limit) {
+        	String name = m_name_.getExtendedOr10Name(m_current_);
+        	if (name != null && name.length() > 0) {
+                result.integer = m_current_;
+        		result.value   = name;
+    			return false;
+            }
+            ++ m_current_;
+        }
+    	return true;
+  	}
+}
--- a/icu4j/src/com/ibm/icu/lang/UCharacterNameReader.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacterNameReader.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/Attic/UCharacterNameReader.java,v $ 
-* $Date: 2002/02/28 23:42:04 $ 
-* $Revision: 1.4 $
+* $Date: 2002/03/08 02:04:00 $ 
+* $Revision: 1.5 $
 *
 *******************************************************************************
 */
@ -92,7 +92,19 @@ final class UCharacterNameReader
        
        size = m_algnamesindex_ - m_groupstringindex_;
        byte groupstring[] = new byte[size];
+        System.out.println("size " + size);
         m_dataInputStream_.readFully(groupstring);
+        // NOTE(review): stdout scan for one fixed byte run, looks like debug
+        for (int i = 0; i + 6 < size; i ++) {
+        	if (groupstring[i] == 0x14 &&
+        	    groupstring[i + 1] == 0x12 &&
+        	    groupstring[i + 2] == 0x3e &&
+        	    groupstring[i + 3] == 0x01 &&
+        	    groupstring[i + 4] == 0x39 &&
+        	    groupstring[i + 5] == 0x4 &&
+        	    groupstring[i + 6] == 0x1e)
+        	    System.out.println("found at " + i);
+        }
        data.setGroup(group, groupstring);
        
        count = m_dataInputStream_.readInt();