Unicode 3.0 character property system code check in

X-SVN-Rev: 3324
This commit is contained in:
Syn Wee Quek 2000-12-26 20:01:08 +00:00
parent 22a81eddc3
commit bc654bbe86
16 changed files with 6144 additions and 0 deletions

View file

@ -0,0 +1,313 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java,v $
* $Date: 2000/12/26 20:01:08 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.test.text;
import com.ibm.icu.text.UCharacter;
import com.ibm.icu.text.UCharacterCategoryEnum;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.Hashtable;
import java.util.Enumeration;
/**
* A class to compare the difference in methods between java.lang.Character and
* UCharacter
* @author Syn Wee Quek
* @since oct 06 2000
* @see com.ibm.icu.text.UCharacter
*/
public final class UCharacterCompare
{
// private variables ================================================
/**
* Tally of the recorded differences: trackDifference() stores, for each
* method label (String key), the number of mismatches seen so far (Integer
* value), and summary() prints one line per key.
*/
private static Hashtable m_hashtable_ = new Hashtable();
// public methods ======================================================
/**
 * Main testing method.
 * Compares the answers of UCharacter with those of java.lang.Character for
 * every UTF-16 code unit from Character.MIN_VALUE to Character.MAX_VALUE
 * inclusive, writes one report line per disagreement and finishes the report
 * with a per-method summary.
 * @param arg command line arguments; arg[0], when present, is the name of
 *            the report file, otherwise "compare.txt" is used
 */
public static void main(String arg[])
{
    PrintWriter p = null;
    try
    {
        String filename = (arg.length == 0) ? "compare.txt" : arg[0];
        p = new PrintWriter(new FileWriter(filename));
        p.print("char character name ");
        p.println("method name ucharacter character");
        // The counter has to be an int: a char counter can never exceed
        // Character.MAX_VALUE, so "<=" would never terminate, and the
        // former "<" bound silently left U+FFFF out of the comparison.
        for (int cp = Character.MIN_VALUE; cp <= Character.MAX_VALUE; cp ++)
            compareCodeUnit(p, (char)cp);
        summary(p);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // close (and thereby flush) the report even when a comparison
        // threw, so that the lines written so far are not lost
        if (p != null)
            p.close();
    }
}
/**
 * Compares every property of one code unit between UCharacter and
 * java.lang.Character and records each mismatch through trackDifference().
 * When the two classes disagree on isDefined() only that difference is
 * recorded and the remaining properties are not compared.
 * @param p report writer
 * @param ch code unit to compare
 * @exception Exception propagated from trackDifference()
 */
private static void compareCodeUnit(PrintWriter p, char ch) throws Exception
{
    boolean udefined = UCharacter.isDefined(ch);
    boolean jdefined = Character.isDefined(ch);
    if (udefined != jdefined)
    {
        trackDifference(p, ch, "isDefined()", "" + udefined, "" + jdefined);
        return;
    }
    reportInt(p, ch, "digit()", UCharacter.digit(ch, 10),
              Character.digit(ch, 10));
    reportInt(p, ch, "getNumericValue()", UCharacter.getNumericValue(ch),
              Character.getNumericValue(ch));
    // the category numbering differs between the two classes, hence the
    // dedicated comparison instead of a plain equality test
    if (!compareType(UCharacter.getType(ch), Character.getType(ch)))
        trackDifference(p, ch, "getType()", "" + UCharacter.getType(ch),
                        "" + Character.getType(ch));
    reportBoolean(p, ch, "isDigit()", UCharacter.isDigit(ch),
                  Character.isDigit(ch));
    reportBoolean(p, ch, "isISOControl()", UCharacter.isISOControl(ch),
                  Character.isISOControl(ch));
    reportBoolean(p, ch, "isLetter()", UCharacter.isLetter(ch),
                  Character.isLetter(ch));
    reportBoolean(p, ch, "isLetterOrDigit()", UCharacter.isLetterOrDigit(ch),
                  Character.isLetterOrDigit(ch));
    reportBoolean(p, ch, "isLowerCase()", UCharacter.isLowerCase(ch),
                  Character.isLowerCase(ch));
    reportBoolean(p, ch, "isWhitespace()", UCharacter.isWhitespace(ch),
                  Character.isWhitespace(ch));
    reportBoolean(p, ch, "isSpaceChar()", UCharacter.isSpaceChar(ch),
                  Character.isSpaceChar(ch));
    // label corrected: it used to read "isTitleChar()", which is not the
    // name of the method being compared
    reportBoolean(p, ch, "isTitleCase()", UCharacter.isTitleCase(ch),
                  Character.isTitleCase(ch));
    reportBoolean(p, ch, "isUnicodeIdentifierPart()",
                  UCharacter.isUnicodeIdentifierPart(ch),
                  Character.isUnicodeIdentifierPart(ch));
    reportBoolean(p, ch, "isUnicodeIdentifierStart()",
                  UCharacter.isUnicodeIdentifierStart(ch),
                  Character.isUnicodeIdentifierStart(ch));
    reportBoolean(p, ch, "isIdentifierIgnorable()",
                  UCharacter.isIdentifierIgnorable(ch),
                  Character.isIdentifierIgnorable(ch));
    reportBoolean(p, ch, "isUpperCase()", UCharacter.isUpperCase(ch),
                  Character.isUpperCase(ch));
    reportHex(p, ch, "toLowerCase()", UCharacter.toLowerCase(ch),
              Character.toLowerCase(ch));
    String ustring = UCharacter.toString(ch);
    String jstring = String.valueOf(ch);
    if (!ustring.equals(jstring))
        trackDifference(p, ch, "toString()", ustring, jstring);
    reportHex(p, ch, "toTitleCase()", UCharacter.toTitleCase(ch),
              Character.toTitleCase(ch));
    reportHex(p, ch, "toUpperCase()", UCharacter.toUpperCase(ch),
              Character.toUpperCase(ch));
}
/**
 * Records a difference when two boolean answers disagree.
 * @param p report writer
 * @param ch code unit the answers belong to
 * @param method label of the compared method
 * @param uvalue answer of UCharacter
 * @param jvalue answer of java.lang.Character
 * @exception Exception propagated from trackDifference()
 */
private static void reportBoolean(PrintWriter p, char ch, String method,
                                  boolean uvalue, boolean jvalue)
                                  throws Exception
{
    if (uvalue != jvalue)
        trackDifference(p, ch, method, "" + uvalue, "" + jvalue);
}
/**
 * Records a difference when two numeric answers disagree; the values are
 * reported in decimal.
 * @param p report writer
 * @param ch code unit the answers belong to
 * @param method label of the compared method
 * @param uvalue answer of UCharacter
 * @param jvalue answer of java.lang.Character
 * @exception Exception propagated from trackDifference()
 */
private static void reportInt(PrintWriter p, char ch, String method,
                              int uvalue, int jvalue) throws Exception
{
    if (uvalue != jvalue)
        trackDifference(p, ch, method, "" + uvalue, "" + jvalue);
}
/**
 * Records a difference when two case mappings disagree; the values are
 * reported in hexadecimal.
 * @param p report writer
 * @param ch code unit the answers belong to
 * @param method label of the compared method
 * @param uvalue mapping returned by UCharacter
 * @param jvalue mapping returned by java.lang.Character
 * @exception Exception propagated from trackDifference()
 */
private static void reportHex(PrintWriter p, char ch, String method,
                              int uvalue, int jvalue) throws Exception
{
    if (uvalue != jvalue)
        trackDifference(p, ch, method, Integer.toHexString(uvalue),
                        Integer.toHexString(jvalue));
}
// private methods ===================================================
/**
 * Tells whether a UCharacter category and a java.lang.Character category
 * denote the same general category.
 * The accepted combinations are listed in a table of
 * {UCharacter type, Character type} pairs; INITIAL_PUNCTUATION and
 * FINAL_PUNCTUATION are accepted against Character.START_PUNCTUATION and
 * Character.END_PUNCTUATION respectively. The GENERAL_OTHER_TYPES pairing
 * that was commented out in the original code stays excluded.
 * @param uchartype UCharacter type
 * @param jchartype java.lang.Character type
 * @return true if the pair is one of the accepted combinations
 */
private static boolean compareType(int uchartype, int jchartype)
{
    final int accepted[][] = {
        {UCharacterCategoryEnum.UNASSIGNED, Character.UNASSIGNED},
        {UCharacterCategoryEnum.UPPERCASE_LETTER, Character.UPPERCASE_LETTER},
        {UCharacterCategoryEnum.LOWERCASE_LETTER, Character.LOWERCASE_LETTER},
        {UCharacterCategoryEnum.TITLECASE_LETTER, Character.TITLECASE_LETTER},
        {UCharacterCategoryEnum.MODIFIER_LETTER, Character.MODIFIER_LETTER},
        {UCharacterCategoryEnum.OTHER_LETTER, Character.OTHER_LETTER},
        {UCharacterCategoryEnum.NON_SPACING_MARK, Character.NON_SPACING_MARK},
        {UCharacterCategoryEnum.ENCLOSING_MARK, Character.ENCLOSING_MARK},
        {UCharacterCategoryEnum.COMBINING_SPACING_MARK,
         Character.COMBINING_SPACING_MARK},
        {UCharacterCategoryEnum.DECIMAL_DIGIT_NUMBER,
         Character.DECIMAL_DIGIT_NUMBER},
        {UCharacterCategoryEnum.LETTER_NUMBER, Character.LETTER_NUMBER},
        {UCharacterCategoryEnum.OTHER_NUMBER, Character.OTHER_NUMBER},
        {UCharacterCategoryEnum.SPACE_SEPARATOR, Character.SPACE_SEPARATOR},
        {UCharacterCategoryEnum.LINE_SEPARATOR, Character.LINE_SEPARATOR},
        {UCharacterCategoryEnum.PARAGRAPH_SEPARATOR,
         Character.PARAGRAPH_SEPARATOR},
        {UCharacterCategoryEnum.CONTROL, Character.CONTROL},
        {UCharacterCategoryEnum.FORMAT, Character.FORMAT},
        {UCharacterCategoryEnum.PRIVATE_USE, Character.PRIVATE_USE},
        {UCharacterCategoryEnum.SURROGATE, Character.SURROGATE},
        {UCharacterCategoryEnum.DASH_PUNCTUATION, Character.DASH_PUNCTUATION},
        {UCharacterCategoryEnum.START_PUNCTUATION,
         Character.START_PUNCTUATION},
        {UCharacterCategoryEnum.END_PUNCTUATION, Character.END_PUNCTUATION},
        {UCharacterCategoryEnum.CONNECTOR_PUNCTUATION,
         Character.CONNECTOR_PUNCTUATION},
        {UCharacterCategoryEnum.OTHER_PUNCTUATION,
         Character.OTHER_PUNCTUATION},
        {UCharacterCategoryEnum.MATH_SYMBOL, Character.MATH_SYMBOL},
        {UCharacterCategoryEnum.CURRENCY_SYMBOL, Character.CURRENCY_SYMBOL},
        {UCharacterCategoryEnum.MODIFIER_SYMBOL, Character.MODIFIER_SYMBOL},
        {UCharacterCategoryEnum.OTHER_SYMBOL, Character.OTHER_SYMBOL},
        {UCharacterCategoryEnum.INITIAL_PUNCTUATION,
         Character.START_PUNCTUATION},
        {UCharacterCategoryEnum.FINAL_PUNCTUATION, Character.END_PUNCTUATION}
    };
    for (int k = 0; k < accepted.length; k ++)
    {
        if (accepted[k][0] == uchartype && accepted[k][1] == jchartype)
            return true;
    }
    return false;
}
/**
 * Records one difference: increments the counter kept for the method label
 * and writes a report line made of the code point in hexadecimal, the
 * character name, the method label and the two diverging values, each but
 * the last padded with spaces to a fixed column width.
 * @param f report writer
 * @param ch code point
 * @param method label of the method that was compared
 * @param ucharval UCharacter value after running method
 * @param charval Character value after running method
 * @exception Exception propagated from the calls made here, if one fails
 */
private static void trackDifference(PrintWriter f, int ch, String method,
                                    String ucharval, String charval)
                                    throws Exception
{
    // a Hashtable never stores null, so a null lookup means "first time"
    Integer previous = (Integer)m_hashtable_.get(method);
    int total = (previous == null) ? 1 : previous.intValue() + 1;
    m_hashtable_.put(method, new Integer(total));

    StringBuffer line = new StringBuffer();
    appendColumn(line, Integer.toHexString(ch), 6);
    String name = UCharacter.getName(ch);
    if (name == null)
        name = " ";
    appendColumn(line, name, 73);
    appendColumn(line, method, 27);
    appendColumn(line, ucharval, 11);
    line.append(charval);
    f.println(line.toString());
}
/**
 * Appends a text followed by as many spaces as it is shorter than the
 * column width; a text at least as long as the width gets no padding.
 * @param line buffer receiving the column
 * @param text column content
 * @param width column width in characters
 */
private static void appendColumn(StringBuffer line, String text, int width)
{
    line.append(text);
    for (int pad = width - text.length(); pad > 0; pad --)
        line.append(' ');
}
/**
 * Writes the closing summary: a separator, a title and then one line per
 * method label holding the label, padded with spaces to 30 characters, and
 * the number of differences recorded for it.
 * @param f report writer
 */
private static void summary(PrintWriter f)
{
    f.println("==================================================");
    f.println("Summary of differences");
    Enumeration labels = m_hashtable_.keys();
    while (labels.hasMoreElements())
    {
        String label = (String)labels.nextElement();
        int count = ((Integer)m_hashtable_.get(label)).intValue();
        StringBuffer row = new StringBuffer(label);
        while (row.length() < 30)
            row.append(' ');
        f.println(row + " " + count);
    }
}
}

View file

@ -0,0 +1,664 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterTest.java,v $
* $Date: 2000/12/26 20:01:08 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.test.text;
import java.io.BufferedReader;
import java.io.FileReader;
import com.ibm.test.TestFmwk;
import com.ibm.icu.text.UCharacter;
import com.ibm.icu.text.UCharacterCategoryEnum;
import com.ibm.icu.text.UCharacterDirectionEnum;
import com.ibm.icu.text.UTF16;
/**
* Testing class for UCharacter
* Mostly following the test cases for ICU
* @author Syn Wee Quek
* @since nov 04 2000
*/
public final class UCharacterTest extends TestFmwk
{
// private variables =============================================
/**
* Expected Unicode data version; TestVersion() compares this string with the
* value returned by UCharacter.getUnicodeVersion()
*/
private final String VERSION_ = "3.0.0.0";
// constructor ===================================================
/**
* Default constructor; the body is empty, so no test state is set up here
*/
public UCharacterTest()
{
}
// public methods ================================================
/**
 * Checks the case predicates and the case mappings of UCharacter on two
 * parallel tables: uppers[k] is the uppercase (or titlecase) counterpart of
 * lowers[k]. Entries that are not letters are only required to map to
 * themselves. The test stops at the first pair that fails.
 */
public void TestUpperLower()
{
    int uppers[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0xb1, 0xb2,
                    0xb3, 0x48, 0x49, 0x4a, 0x2e, 0x3f, 0x3a, 0x4b, 0x4c,
                    0x4d, 0x4e, 0x4f, 0x01c4, 0x01c8, 0x000c, 0x0000};
    int lowers[] = {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xb1, 0x00b2,
                    0xb3, 0x68, 0x69, 0x6a, 0x2e, 0x3f, 0x3a, 0x6b, 0x6c,
                    0x6d, 0x6e, 0x6f, 0x01c6, 0x01c9, 0x000c, 0x0000};
    for (int k = 0; k < uppers.length; k ++)
    {
        int up = uppers[k];
        int low = lowers[k];
        String upHex = Integer.toHexString(up);
        String lowHex = Integer.toHexString(low);
        // the first condition that holds decides the failure message
        String failure = null;
        if (UCharacter.isLetter(low) && !UCharacter.isLowerCase(low))
            failure = "FAIL isLowerCase test for 0x" + lowHex;
        else if (UCharacter.isLetter(up) &&
                 !(UCharacter.isUpperCase(up) || UCharacter.isTitleCase(up)))
            failure = "FAIL isUpperCase test for 0x" + upHex;
        else if (low != UCharacter.toLowerCase(up) ||
                 (up != UCharacter.toUpperCase(low) &&
                  up != UCharacter.toTitleCase(low)))
            failure = "FAIL case conversion test for 0x" + upHex + " to 0x" +
                      lowHex;
        else if (low != UCharacter.toLowerCase(low))
            failure = "FAIL lower case conversion test for 0x" + lowHex;
        else if (up != UCharacter.toUpperCase(up) &&
                 up != UCharacter.toTitleCase(up))
            failure = "FAIL upper case conversion test for 0x" + upHex;

        if (failure != null)
        {
            errln(failure);
            break;
        }
        logln("Ok 0x" + upHex + " and 0x" + lowHex);
    }
}
/**
 * Testing the letter and number determination in UCharacter.
 * U+0041..U+005A must be letters and letter-or-digit; U+0660..U+0669 must
 * be digits, must not be letters and must be letter-or-digit. Every
 * offending code point is reported, the loops do not stop at the first one.
 */
public void TestLetterNumber()
{
    for (int i = 0x0041; i < 0x005B; i ++)
    {
        if (!UCharacter.isLetter(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected to be a letter");
        // this message used to read "expected not to be a digit", which is
        // not what the isLetterOrDigit check establishes
        if (!UCharacter.isLetterOrDigit(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected to be either a letter or a digit");
    }
    for (int i = 0x0660; i < 0x066A; i ++)
    {
        if (UCharacter.isLetter(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected not to be a letter");
        if (!UCharacter.isDigit(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected to be a digit");
        // leading space added so that the message no longer runs into the
        // hexadecimal code point
        if (!UCharacter.isLetterOrDigit(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected to be either a letter or a digit");
    }
}
/**
 * Tests for space determination in UCharacter.
 * Four parallel tables of equal length are walked together: characters
 * that must / must not satisfy isSpaceChar() and characters that must /
 * must not satisfy isWhitespace(). The test stops at the first failure.
 */
public void TestSpaces()
{
    int spaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
    int nonspaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
    int whitespaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
    int nonwhitespaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f};
    int size = spaces.length;
    for (int i = 0; i < size; i ++)
    {
        if (!UCharacter.isSpaceChar(spaces[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(spaces[i]) +
                  " expected to be a space character");
            break;
        }
        if (UCharacter.isSpaceChar(nonspaces[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(nonspaces[i]) +
                  " expected not to be space character");
            break;
        }
        if (!UCharacter.isWhitespace(whitespaces[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(whitespaces[i]) +
                  " expected to be a white space character");
            break;
        }
        if (UCharacter.isWhitespace(nonwhitespaces[i]))
        {
            // message corrected: the failed predicate is isWhitespace(),
            // the old text spoke of a "space character"
            errln("FAIL 0x" + Integer.toHexString(nonwhitespaces[i]) +
                  " expected not to be a white space character");
            break;
        }
        logln("Ok 0x" + Integer.toHexString(spaces[i]) + " and 0x" +
              Integer.toHexString(nonspaces[i]) + " and 0x" +
              Integer.toHexString(whitespaces[i]) + " and 0x" +
              Integer.toHexString(nonwhitespaces[i]));
    }
}
/**
 * Checks UCharacter.isDefined() on a table of code points expected to be
 * undefined and a parallel table of code points expected to be defined.
 * The test stops at the first failure.
 */
public void TestDefined()
{
    int unassigned[] = {0xfff1, 0xfff7, 0xfa30};
    int assigned[] = {0x523E, 0x4f88, 0xfffd};
    for (int k = 0; k < unassigned.length; k ++)
    {
        String failure = null;
        if (UCharacter.isDefined(unassigned[k]))
            failure = "FAIL 0x" + Integer.toHexString(unassigned[k]) +
                      " expected not to be defined";
        else if (!UCharacter.isDefined(assigned[k]))
            failure = "FAIL 0x" + Integer.toHexString(assigned[k]) +
                      " expected defined";
        if (failure != null)
        {
            errln(failure);
            break;
        }
    }
}
/**
 * Checks UCharacter.isBaseForm() on a table of base characters and a
 * parallel table of non-base characters; for every index the non-base
 * entry is checked first. The test stops at the first failure.
 */
public void TestBase()
{
    int baseforms[] = {0x0061, 0x0031, 0x03d2};
    int others[] = {0x002B, 0x0020, 0x203B};
    for (int k = 0; k < baseforms.length; k ++)
    {
        String failure = null;
        if (UCharacter.isBaseForm(others[k]))
            failure = "FAIL 0x" + Integer.toHexString(others[k]) +
                      " expected not to be a base character";
        else if (!UCharacter.isBaseForm(baseforms[k]))
            failure = "FAIL 0x" + Integer.toHexString(baseforms[k]) +
                      " expected to be a base character";
        if (failure != null)
        {
            errln(failure);
            break;
        }
    }
}
/**
 * Tests for digit characters.
 * For each entry of the two digit tables the value returned by
 * UCharacter.digit() is compared with the expected one, but only when
 * UCharacter.isDigit() accepts the code point; entries it does not classify
 * as digits are skipped rather than reported. The entries of the non-digit
 * table must all be rejected by isDigit(). Each loop stops at its first
 * failure.
 */
public void TestDigits()
{
    int digits[] = {0x0030, 0x0662, 0x0F23, 0x0ED5, 0x2160};
    //special characters not in the properties table
    int digits2[] = {0x3007, 0x4e00, 0x4e8c, 0x4e09, 0x56d8, 0x4e94, 0x516d,
                     0x4e03, 0x516b, 0x4e5d};
    int nondigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
    int digitvalues[] = {0, 2, 3, 5, 1};
    int digitvalues2[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    int size = digits.length;
    for (int i = 0; i < size; i ++)
        if (UCharacter.isDigit(digits[i]) &&
            UCharacter.digit(digits[i]) != digitvalues[i])
        {
            errln("FAIL 0x" + Integer.toHexString(digits[i]) +
                  " expected digit with value " + digitvalues[i]);
            break;
        }
    size = nondigits.length;
    for (int i = 0; i < size; i ++)
        if (UCharacter.isDigit(nondigits[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(nondigits[i]) +
                  " expected nondigit");
            break;
        }
    // bound taken from the table itself; the loop used to run to a
    // hard-coded 10 and ignored the length computed just before it
    size = digits2.length;
    for (int i = 0; i < size; i ++)
        if (UCharacter.isDigit(digits2[i]) &&
            UCharacter.digit(digits2[i]) != digitvalues2[i])
        {
            errln("FAIL 0x" + Integer.toHexString(digits2[i]) +
                  " expected digit with value " + digitvalues2[i]);
            break;
        }
}
/**
 * Tests for version: UCharacter.getUnicodeVersion() must return exactly the
 * string held in VERSION_.
 */
public void TestVersion()
{
    String version = UCharacter.getUnicodeVersion();
    // comparing from the constant keeps a null result from raising a
    // NullPointerException; it is reported as an ordinary failure instead
    if (!VERSION_.equals(version))
        errln("FAIL expected " + VERSION_ + " but got " + version);
}
/**
 * Checks UCharacter.isControl() on a table of control characters and a
 * parallel table of characters that are not controls. The test stops at
 * the first failure and logs every pair that passes.
 */
public void TestControl()
{
    int controls[] = {0x001b, 0x0097, 0x0082};
    int others[] = {0x61, 0x0031, 0x00e2};
    for (int k = 0; k < controls.length; k ++)
    {
        String controlHex = Integer.toHexString(controls[k]);
        String otherHex = Integer.toHexString(others[k]);
        String failure = null;
        if (!UCharacter.isControl(controls[k]))
            failure = "FAIL 0x" + controlHex +
                      " expected to be a control character";
        else if (UCharacter.isControl(others[k]))
            failure = "FAIL 0x" + otherHex +
                      " expected to be not a control character";
        if (failure != null)
        {
            errln(failure);
            break;
        }
        logln("Ok 0x" + controlHex + " and 0x" + otherHex);
    }
}
/**
 * Checks UCharacter.isPrintable() on a table of printable characters and a
 * parallel table of characters expected not to be printable. The test
 * stops at the first failure and logs every pair that passes.
 */
public void TestPrint()
{
    int visible[] = {0x0042, 0x005f, 0x2014};
    int invisible[] = {0x200c, 0x009f, 0x001b};
    for (int k = 0; k < visible.length; k ++)
    {
        String visibleHex = Integer.toHexString(visible[k]);
        String invisibleHex = Integer.toHexString(invisible[k]);
        String failure = null;
        if (!UCharacter.isPrintable(visible[k]))
            failure = "FAIL 0x" + visibleHex +
                      " expected to be a printable character";
        else if (UCharacter.isPrintable(invisible[k]))
            failure = "FAIL 0x" + invisibleHex +
                      " expected not to be a printable character";
        if (failure != null)
        {
            errln(failure);
            break;
        }
        logln("Ok 0x" + visibleHex + " and 0x" + invisibleHex);
    }
}
/**
 * Checks the identifier predicates of UCharacter - isUnicodeIdentifierStart,
 * isUnicodeIdentifierPart and isIdentifierIgnorable - each on a table of
 * characters that must be accepted and a parallel table of characters that
 * must be rejected. The test stops at the first failure and logs every
 * index that passes.
 */
public void TestIdentifier()
{
    int starts[] = {0x0250, 0x00e2, 0x0061};
    int nonstarts[] = {0x2000, 0x000a, 0x2019};
    int parts[] = {0x005f, 0x0032, 0x0045};
    int nonparts[] = {0x2030, 0x00a3, 0x0020};
    int ignorables[] = {0x070F, 0x180B, 0x180C};
    int nonignorables[] = {0x0075, 0x00a3, 0x0061};
    for (int k = 0; k < starts.length; k ++)
    {
        String startHex = Integer.toHexString(starts[k]);
        String nonstartHex = Integer.toHexString(nonstarts[k]);
        String partHex = Integer.toHexString(parts[k]);
        String nonpartHex = Integer.toHexString(nonparts[k]);
        String ignorableHex = Integer.toHexString(ignorables[k]);
        String nonignorableHex = Integer.toHexString(nonignorables[k]);
        // the first condition that holds decides the failure message
        String failure = null;
        if (!UCharacter.isUnicodeIdentifierStart(starts[k]))
            failure = "FAIL 0x" + startHex +
                      " expected to be a unicode identifier start character";
        else if (UCharacter.isUnicodeIdentifierStart(nonstarts[k]))
            failure = "FAIL 0x" + nonstartHex +
                      " expected not to be a unicode identifier start character";
        else if (!UCharacter.isUnicodeIdentifierPart(parts[k]))
            failure = "FAIL 0x" + partHex +
                      " expected to be a unicode identifier part character";
        else if (UCharacter.isUnicodeIdentifierPart(nonparts[k]))
            failure = "FAIL 0x" + nonpartHex +
                      " expected not to be a unicode identifier part character";
        else if (!UCharacter.isIdentifierIgnorable(ignorables[k]))
            failure = "FAIL 0x" + ignorableHex +
                      " expected to be a ignorable unicode character";
        else if (UCharacter.isIdentifierIgnorable(nonignorables[k]))
            failure = "FAIL 0x" + nonignorableHex +
                      " expected not to be a ignorable unicode character";

        if (failure != null)
        {
            errln(failure);
            break;
        }
        logln("Ok 0x" + startHex + " and 0x" + nonstartHex + " and 0x" +
              partHex + " and 0x" + nonpartHex + " and 0x" + ignorableHex +
              " and 0x" + nonignorableHex);
    }
}
/**
 * Tests for the character types, direction.
 * Reads the file "UnicodeData-" + UCharacter.getUnicodeVersion() + ".txt"
 * line by line up to and including the entry for U+FFFD and compares the
 * general category and the direction given on each line with
 * UCharacter.getType() and UCharacter.getDirection(). The scan stops at the
 * first mismatch. A data file that is missing, unreadable or ends early is
 * reported as a failure. Finally the direction of U+10001 is checked.
 */
public void TestCatDir()
{
    // this is the 2 char category types used in the UnicodeData file
    final String TYPE =
        "LuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf";
    // direction types used in the UnicodeData file; every entry is padded
    // with spaces to exactly 4 characters, so that
    // (position of the entry) >> 2 is the expected direction value
    final String DIR =
        "L   R   EN  ES  ET  AN  CS  B   S   WS  ON  LRE LRO AL  RLE RLO PDF NSM BN  ";
    final int LASTUNICODECHAR = 0xFFFD;
    String file = "UnicodeData-" + UCharacter.getUnicodeVersion() + ".txt";
    BufferedReader input = null;
    try
    {
        // reading in the UnicodeData file
        input = new BufferedReader(new FileReader(file));
        int ch = 0;
        while (ch != LASTUNICODECHAR)
        {
            String s = input.readLine();
            if (s == null)
            {
                // end of file before the last expected entry
                errln("FAIL " + file + " ended before the entry for 0x" +
                      Integer.toHexString(LASTUNICODECHAR));
                break;
            }
            // getting the unicode character, its type and its direction
            ch = Integer.parseInt(s.substring(0, 4), 16);
            int index = s.indexOf(';', 5);
            String t = s.substring(index + 1, index + 3);
            index = s.indexOf(';', index + 4);
            String d = s.substring(index + 1, s.indexOf(';', index + 1));

            // testing the category
            // we override the general category of some control characters
            int type;
            if (ch == 9 || ch == 0xb || ch == 0x1f)
                type = UCharacterCategoryEnum.SPACE_SEPARATOR;
            else if (ch == 0xc)
                type = UCharacterCategoryEnum.LINE_SEPARATOR;
            else if (ch == 0xa || ch == 0xd || ch == 0x1c || ch == 0x1d ||
                     ch == 0x1e || ch == 0x85)
                type = UCharacterCategoryEnum.PARAGRAPH_SEPARATOR;
            else
            {
                // 2 characters per entry; an unknown abbreviation gives 0
                type = TYPE.indexOf(t);
                if (type < 0)
                    type = 0;
                else
                    type = (type >> 1) + 1;
            }
            if (UCharacter.getType(ch) != type)
            {
                errln("FAIL 0x" + Integer.toHexString(ch) +
                      " expected type " + type);
                break;
            }

            // testing the direction
            // the abbreviation is padded to the width of a table entry so
            // that it can only match a whole entry ("S" must not be found
            // inside "ES"); an unknown abbreviation yields -1
            StringBuffer padded = new StringBuffer(d);
            while (padded.length() < 4)
                padded.append(' ');
            int dir = DIR.indexOf(padded.toString()) >> 2;
            if (UCharacter.getDirection(ch) != dir)
            {
                errln("FAIL 0x" + Integer.toHexString(ch) +
                      " expected direction " + dir);
                break;
            }
        }
    }
    catch (Exception e)
    {
        // a missing or malformed data file must not let the test pass
        e.printStackTrace();
        errln("FAIL could not check " + file + ": " + e);
    }
    finally
    {
        if (input != null)
        {
            try
            {
                input.close();
            }
            catch (Exception ignored)
            {
                // nothing useful can be done if closing the reader fails
            }
        }
    }
    if (UCharacter.getDirection(0x10001) !=
        UCharacterDirectionEnum.LEFT_TO_RIGHT)
        errln("FAIL 0x10001 expected direction " +
              UCharacterDirectionEnum.toString(UCharacterDirectionEnum.LEFT_TO_RIGHT));
}
/**
 * Test for the character names.
 * For a table of code points the modern name and the Unicode 1.0 name are
 * compared (ignoring case) with the expected ones, and the code point is
 * looked up again from both names; an empty expected 1.0 name means that
 * none is expected. Afterwards every code point from UCharacter.MIN_VALUE
 * to UCharacter.MAX_VALUE that has a name must be found again through that
 * name. Each loop stops at its first failure.
 */
public void TestNames()
{
    int c[] = {0x0061, 0x0284, 0x3401, 0x7fed, 0xac00, 0xd7a3, 0xff08, 0xffe5};
    String name[] = {"LATIN SMALL LETTER A",
                     "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
                     "CJK UNIFIED IDEOGRAPH-3401",
                     "CJK UNIFIED IDEOGRAPH-7FED", "HANGUL SYLLABLE GA",
                     "HANGUL SYLLABLE HIH", "FULLWIDTH LEFT PARENTHESIS",
                     "FULLWIDTH YEN SIGN"};
    String oldname[] = {"", "LATIN SMALL LETTER DOTLESS J BAR HOOK", "", "",
                        "", "", "FULLWIDTH OPENING PARENTHESIS", ""};
    int size = c.length;
    String str;
    int uc;
    for (int i = 0; i < size; i ++)
    {
        // modern Unicode character name
        // a missing name is reported as a failure instead of raising a
        // NullPointerException
        str = UCharacter.getName(c[i]);
        if (str == null || !str.equalsIgnoreCase(name[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(c[i]) + " expected name " +
                  name[i]);
            break;
        }
        // 1.0 Unicode character name
        str = UCharacter.getName1_0(c[i]);
        if ((str == null && oldname[i].length() > 0) ||
            (str != null && !str.equalsIgnoreCase(oldname[i])))
        {
            errln("FAIL 0x" + Integer.toHexString(c[i]) +
                  " expected 1.0 name " + oldname[i]);
            break;
        }
        // retrieving unicode character from modern name
        uc = UCharacter.getCharFromName(name[i]);
        if (uc != c[i])
        {
            errln("FAIL " + name[i] + " expected character 0x" +
                  Integer.toHexString(c[i]));
            break;
        }
        // retrieving unicode character from 1.0 name; the result is only
        // checked for the entries that do have a 1.0 name (this replaces
        // the former test on the literal indexes 1 and 6)
        uc = UCharacter.getCharFromName1_0(oldname[i]);
        if (oldname[i].length() > 0 && uc != c[i])
        {
            // the message now shows the 1.0 name that was looked up
            errln("FAIL " + oldname[i] + " expected 1.0 character 0x" +
                  Integer.toHexString(c[i]));
            break;
        }
    }
    // extra testing different from icu
    // MAX_VALUE itself is included; the former "<" bound left it out
    for (int i = UCharacter.MIN_VALUE; i <= UCharacter.MAX_VALUE; i ++)
    {
        str = UCharacter.getName(i);
        if (str != null && UCharacter.getCharFromName(str) != i)
        {
            errln("FAIL 0x" + Integer.toHexString(i) + " " + str +
                  " retrieval of name and vice versa" );
            break;
        }
    }
}
/**
 * Testing UTF16 class methods append, getCharCount and bounds.
 * Every code point from UCharacter.MIN_VALUE to UCharacter.MAX_VALUE is
 * appended to a short fixed text; the number of chars added must match
 * UTF16.getCharCount() (1 for BMP, 2 for supplementary code points) and
 * UTF16.bounds() must classify the appended chars accordingly. The bounds
 * check is skipped for BMP surrogate code points. The test stops at the
 * first failure.
 */
public void TestUTF16AppendBoundCount()
{
    StringBuffer str = new StringBuffer("this is a string ");
    final int prefix = str.length();
    // MAX_VALUE itself is included; the former "<" bound left it out
    for (int i = UCharacter.MIN_VALUE; i <= UCharacter.MAX_VALUE; i ++)
    {
        // Drop the previously appended code point. The buffer used to keep
        // growing to more than a million chars while being copied by
        // toString() on every iteration, i.e. quadratic work overall.
        str.setLength(prefix);
        UTF16.append(str, i);
        String s = str.toString();
        int added = s.length() - prefix;
        if (!UCharacter.isSupplementary(i))
        {
            if (UTF16.getCharCount(i) != 1)
            {
                errln("FAIL Counting BMP character size error" );
                break;
            }
            if (added != 1)
            {
                errln("FAIL Adding a BMP character error" );
                break;
            }
            if (!UTF16.isSurrogate((char)i) &&
                UTF16.bounds(s, s.length() - 1) !=
                UTF16.SINGLE_CHAR_BOUNDARY)
            {
                errln("FAIL Finding BMP character bounds error" );
                break;
            }
        }
        else
        {
            if (UTF16.getCharCount(i) != 2)
            {
                errln("FAIL Counting Supplementary character size error" );
                break;
            }
            if (added != 2)
            {
                errln("FAIL Adding a Supplementary character error" );
                break;
            }
            if (UTF16.bounds(s, s.length() - 2) !=
                UTF16.LEAD_SURROGATE_BOUNDARY ||
                UTF16.bounds(s, s.length() - 1) !=
                UTF16.TRAIL_SURROGATE_BOUNDARY)
            {
                errln("FAIL Finding Supplementary character bounds error" );
                break;
            }
        }
    }
}
/**
 * Checks UTF16.charAt, UTF16.findCPOffset and UTF16.findOffsetFromCP on the
 * text "12345" + U+10001 + "67890" + U+10002, in which each supplementary
 * code point occupies two chars. The three groups of expectations are
 * evaluated one after the other and each reports its own failure.
 */
public void TestUTF16OffsetCharAt()
{
    StringBuffer buffer = new StringBuffer("12345");
    UTF16.append(buffer, 0x10001);
    buffer.append("67890");
    UTF16.append(buffer, 0x10002);
    String text = buffer.toString();

    boolean charAtOk = UTF16.charAt(text, 0) == '1' &&
                       UTF16.charAt(text, 2) == '3' &&
                       UTF16.charAt(text, 5) == 0x10001 &&
                       UTF16.charAt(text, 6) == 0x10001 &&
                       UTF16.charAt(text, 12) == 0x10002 &&
                       UTF16.charAt(text, 13) == 0x10002;
    if (!charAtOk)
        errln("FAIL Getting character from string error" );

    boolean cpOffsetOk = UTF16.findCPOffset(text, 3) == 3 &&
                         UTF16.findCPOffset(text, 5) == 5 &&
                         UTF16.findCPOffset(text, 6) == 6;
    if (!cpOffsetOk)
        errln("FAIL Getting codepoint offset from string error" );

    boolean utf16OffsetOk = UTF16.findOffsetFromCP(text, 3) == 3 &&
                            UTF16.findOffsetFromCP(text, 5) == 5 &&
                            UTF16.findOffsetFromCP(text, 6) == 7;
    if (!utf16OffsetOk)
        errln("FAIL Getting UTF16 offset from codepoint in string error" );
}
/**
 * Command line entry point: creates a UCharacterTest and hands the
 * arguments to its run() method; any exception is printed, not rethrown.
 * @param arg command line arguments passed on to run()
 */
public static void main(String[] arg)
{
    try
    {
        new UCharacterTest().run(arg);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}
}

View file

@ -0,0 +1,313 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/test/text/Attic/UCharacterCompare.java,v $
* $Date: 2000/12/26 20:01:08 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.test.text;
import com.ibm.icu.text.UCharacter;
import com.ibm.icu.text.UCharacterCategoryEnum;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.util.Hashtable;
import java.util.Enumeration;
/**
* A class to compare the difference in methods between java.lang.Character and
* UCharacter
* @author Syn Wee Quek
* @since oct 06 2000
* @see com.ibm.icu.text.UCharacter
*/
public final class UCharacterCompare
{
// private variables ================================================
// Table shared by the static methods of this class. NOTE(review): it
// presumably holds the per-method difference counts filled in by
// trackDifference() and printed by summary() - neither method is visible
// in this excerpt, confirm against them.
private static Hashtable m_hashtable_ = new Hashtable();
// public methods ======================================================
/**
 * Main testing method.
 * Compares the answers of UCharacter with those of java.lang.Character for
 * every UTF-16 code unit from Character.MIN_VALUE to Character.MAX_VALUE
 * inclusive, writes one report line per disagreement and finishes the report
 * with a per-method summary.
 * @param arg command line arguments; arg[0], when present, is the name of
 *            the report file, otherwise "compare.txt" is used
 */
public static void main(String arg[])
{
    PrintWriter p = null;
    try
    {
        String filename = (arg.length == 0) ? "compare.txt" : arg[0];
        p = new PrintWriter(new FileWriter(filename));
        p.print("char character name ");
        p.println("method name ucharacter character");
        // The counter has to be an int: a char counter can never exceed
        // Character.MAX_VALUE, so "<=" would never terminate, and the
        // former "<" bound silently left U+FFFF out of the comparison.
        for (int cp = Character.MIN_VALUE; cp <= Character.MAX_VALUE; cp ++)
            compareCodeUnit(p, (char)cp);
        summary(p);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // close (and thereby flush) the report even when a comparison
        // threw, so that the lines written so far are not lost
        if (p != null)
            p.close();
    }
}
/**
 * Compares every property of one code unit between UCharacter and
 * java.lang.Character and records each mismatch through trackDifference().
 * When the two classes disagree on isDefined() only that difference is
 * recorded and the remaining properties are not compared.
 * @param p report writer
 * @param ch code unit to compare
 * @exception Exception propagated from trackDifference()
 */
private static void compareCodeUnit(PrintWriter p, char ch) throws Exception
{
    boolean udefined = UCharacter.isDefined(ch);
    boolean jdefined = Character.isDefined(ch);
    if (udefined != jdefined)
    {
        trackDifference(p, ch, "isDefined()", "" + udefined, "" + jdefined);
        return;
    }
    reportInt(p, ch, "digit()", UCharacter.digit(ch, 10),
              Character.digit(ch, 10));
    reportInt(p, ch, "getNumericValue()", UCharacter.getNumericValue(ch),
              Character.getNumericValue(ch));
    // the category numbering differs between the two classes, hence the
    // dedicated comparison instead of a plain equality test
    if (!compareType(UCharacter.getType(ch), Character.getType(ch)))
        trackDifference(p, ch, "getType()", "" + UCharacter.getType(ch),
                        "" + Character.getType(ch));
    reportBoolean(p, ch, "isDigit()", UCharacter.isDigit(ch),
                  Character.isDigit(ch));
    reportBoolean(p, ch, "isISOControl()", UCharacter.isISOControl(ch),
                  Character.isISOControl(ch));
    reportBoolean(p, ch, "isLetter()", UCharacter.isLetter(ch),
                  Character.isLetter(ch));
    reportBoolean(p, ch, "isLetterOrDigit()", UCharacter.isLetterOrDigit(ch),
                  Character.isLetterOrDigit(ch));
    reportBoolean(p, ch, "isLowerCase()", UCharacter.isLowerCase(ch),
                  Character.isLowerCase(ch));
    reportBoolean(p, ch, "isWhitespace()", UCharacter.isWhitespace(ch),
                  Character.isWhitespace(ch));
    reportBoolean(p, ch, "isSpaceChar()", UCharacter.isSpaceChar(ch),
                  Character.isSpaceChar(ch));
    // label corrected: it used to read "isTitleChar()", which is not the
    // name of the method being compared
    reportBoolean(p, ch, "isTitleCase()", UCharacter.isTitleCase(ch),
                  Character.isTitleCase(ch));
    reportBoolean(p, ch, "isUnicodeIdentifierPart()",
                  UCharacter.isUnicodeIdentifierPart(ch),
                  Character.isUnicodeIdentifierPart(ch));
    reportBoolean(p, ch, "isUnicodeIdentifierStart()",
                  UCharacter.isUnicodeIdentifierStart(ch),
                  Character.isUnicodeIdentifierStart(ch));
    reportBoolean(p, ch, "isIdentifierIgnorable()",
                  UCharacter.isIdentifierIgnorable(ch),
                  Character.isIdentifierIgnorable(ch));
    reportBoolean(p, ch, "isUpperCase()", UCharacter.isUpperCase(ch),
                  Character.isUpperCase(ch));
    reportHex(p, ch, "toLowerCase()", UCharacter.toLowerCase(ch),
              Character.toLowerCase(ch));
    String ustring = UCharacter.toString(ch);
    String jstring = String.valueOf(ch);
    if (!ustring.equals(jstring))
        trackDifference(p, ch, "toString()", ustring, jstring);
    reportHex(p, ch, "toTitleCase()", UCharacter.toTitleCase(ch),
              Character.toTitleCase(ch));
    reportHex(p, ch, "toUpperCase()", UCharacter.toUpperCase(ch),
              Character.toUpperCase(ch));
}
/**
 * Records a difference when two boolean answers disagree.
 * @param p report writer
 * @param ch code unit the answers belong to
 * @param method label of the compared method
 * @param uvalue answer of UCharacter
 * @param jvalue answer of java.lang.Character
 * @exception Exception propagated from trackDifference()
 */
private static void reportBoolean(PrintWriter p, char ch, String method,
                                  boolean uvalue, boolean jvalue)
                                  throws Exception
{
    if (uvalue != jvalue)
        trackDifference(p, ch, method, "" + uvalue, "" + jvalue);
}
/**
 * Records a difference when two numeric answers disagree; the values are
 * reported in decimal.
 * @param p report writer
 * @param ch code unit the answers belong to
 * @param method label of the compared method
 * @param uvalue answer of UCharacter
 * @param jvalue answer of java.lang.Character
 * @exception Exception propagated from trackDifference()
 */
private static void reportInt(PrintWriter p, char ch, String method,
                              int uvalue, int jvalue) throws Exception
{
    if (uvalue != jvalue)
        trackDifference(p, ch, method, "" + uvalue, "" + jvalue);
}
/**
 * Records a difference when two case mappings disagree; the values are
 * reported in hexadecimal.
 * @param p report writer
 * @param ch code unit the answers belong to
 * @param method label of the compared method
 * @param uvalue mapping returned by UCharacter
 * @param jvalue mapping returned by java.lang.Character
 * @exception Exception propagated from trackDifference()
 */
private static void reportHex(PrintWriter p, char ch, String method,
                              int uvalue, int jvalue) throws Exception
{
    if (uvalue != jvalue)
        trackDifference(p, ch, method, Integer.toHexString(uvalue),
                        Integer.toHexString(jvalue));
}
// private methods ===================================================
/**
* Decides whether a UCharacter category and a java.lang.Character category
* describe the same general category.
* Every UCharacterCategoryEnum value is paired with the java.lang.Character
* constant of the same name, except INITIAL_PUNCTUATION and
* FINAL_PUNCTUATION, which are paired with Character.START_PUNCTUATION and
* Character.END_PUNCTUATION respectively. GENERAL_OTHER_TYPES has no partner
* and therefore never matches.
* @param uchartype category value as defined in UCharacterCategoryEnum
* @param jchartype category value as defined in java.lang.Character
* @return true if the two values form one of the recognised pairs, false
*         otherwise
*/
private static boolean compareType(int uchartype, int jchartype)
{
    switch (uchartype)
    {
        case UCharacterCategoryEnum.UNASSIGNED :
            return jchartype == Character.UNASSIGNED;
        case UCharacterCategoryEnum.UPPERCASE_LETTER :
            return jchartype == Character.UPPERCASE_LETTER;
        case UCharacterCategoryEnum.LOWERCASE_LETTER :
            return jchartype == Character.LOWERCASE_LETTER;
        case UCharacterCategoryEnum.TITLECASE_LETTER :
            return jchartype == Character.TITLECASE_LETTER;
        case UCharacterCategoryEnum.MODIFIER_LETTER :
            return jchartype == Character.MODIFIER_LETTER;
        case UCharacterCategoryEnum.OTHER_LETTER :
            return jchartype == Character.OTHER_LETTER;
        case UCharacterCategoryEnum.NON_SPACING_MARK :
            return jchartype == Character.NON_SPACING_MARK;
        case UCharacterCategoryEnum.ENCLOSING_MARK :
            return jchartype == Character.ENCLOSING_MARK;
        case UCharacterCategoryEnum.COMBINING_SPACING_MARK :
            return jchartype == Character.COMBINING_SPACING_MARK;
        case UCharacterCategoryEnum.DECIMAL_DIGIT_NUMBER :
            return jchartype == Character.DECIMAL_DIGIT_NUMBER;
        case UCharacterCategoryEnum.LETTER_NUMBER :
            return jchartype == Character.LETTER_NUMBER;
        case UCharacterCategoryEnum.OTHER_NUMBER :
            return jchartype == Character.OTHER_NUMBER;
        case UCharacterCategoryEnum.SPACE_SEPARATOR :
            return jchartype == Character.SPACE_SEPARATOR;
        case UCharacterCategoryEnum.LINE_SEPARATOR :
            return jchartype == Character.LINE_SEPARATOR;
        case UCharacterCategoryEnum.PARAGRAPH_SEPARATOR :
            return jchartype == Character.PARAGRAPH_SEPARATOR;
        case UCharacterCategoryEnum.CONTROL :
            return jchartype == Character.CONTROL;
        case UCharacterCategoryEnum.FORMAT :
            return jchartype == Character.FORMAT;
        case UCharacterCategoryEnum.PRIVATE_USE :
            return jchartype == Character.PRIVATE_USE;
        case UCharacterCategoryEnum.SURROGATE :
            return jchartype == Character.SURROGATE;
        case UCharacterCategoryEnum.DASH_PUNCTUATION :
            return jchartype == Character.DASH_PUNCTUATION;
        case UCharacterCategoryEnum.START_PUNCTUATION :
            return jchartype == Character.START_PUNCTUATION;
        case UCharacterCategoryEnum.END_PUNCTUATION :
            return jchartype == Character.END_PUNCTUATION;
        case UCharacterCategoryEnum.CONNECTOR_PUNCTUATION :
            return jchartype == Character.CONNECTOR_PUNCTUATION;
        case UCharacterCategoryEnum.OTHER_PUNCTUATION :
            return jchartype == Character.OTHER_PUNCTUATION;
        case UCharacterCategoryEnum.MATH_SYMBOL :
            return jchartype == Character.MATH_SYMBOL;
        case UCharacterCategoryEnum.CURRENCY_SYMBOL :
            return jchartype == Character.CURRENCY_SYMBOL;
        case UCharacterCategoryEnum.MODIFIER_SYMBOL :
            return jchartype == Character.MODIFIER_SYMBOL;
        case UCharacterCategoryEnum.OTHER_SYMBOL :
            return jchartype == Character.OTHER_SYMBOL;
        // the quote punctuation categories are compared against the
        // open/close punctuation categories of java.lang.Character
        case UCharacterCategoryEnum.INITIAL_PUNCTUATION :
            return jchartype == Character.START_PUNCTUATION;
        case UCharacterCategoryEnum.FINAL_PUNCTUATION :
            return jchartype == Character.END_PUNCTUATION;
    }
    // GENERAL_OTHER_TYPES and any unknown value
    return false;
}
/**
* Records one discrepancy between UCharacter and java.lang.Character.
* The per-method counter kept in m_hashtable_ is incremented and one report
* line is written, made of the columns: code point in hex, character name,
* method name, UCharacter result and Character result.
* @param f writer the report line is printed to
* @param ch code point
* @param method name of the method whose results differ
* @param ucharval UCharacter value after running method
* @param charval Character value after running method
* @exception Exception declared for callers; nothing in this method throws
*            a checked exception itself
*/
private static void trackDifference(PrintWriter f, int ch, String method,
                                    String ucharval, String charval)
                                    throws Exception
{
    // one more difference seen for this method
    Integer tally = (Integer)m_hashtable_.get(method);
    int occurrences = 1;
    if (tally != null)
        occurrences = tally.intValue() + 1;
    m_hashtable_.put(method, new Integer(occurrences));

    String name = UCharacter.getName(ch);
    if (name == null)
        name = " ";

    StringBuffer line = new StringBuffer();
    appendColumn(line, Integer.toHexString(ch), 6);
    appendColumn(line, name, 73);
    appendColumn(line, method, 27);
    appendColumn(line, ucharval, 11);
    line.append(charval);
    f.println(line.toString());
}

/**
* Appends text to the buffer and fills up with blanks to the column width.
* Text longer than the width is appended unchanged, without any padding.
* @param line buffer to append to
* @param text column content
* @param width minimum number of characters the column takes up
*/
private static void appendColumn(StringBuffer line, String text, int width)
{
    line.append(text);
    int blanks = width - text.length();
    while (blanks > 0)
    {
        line.append(' ');
        blanks --;
    }
}
/**
* Prints a closing section listing, for every method that showed at least
* one difference, the method name and the number of differences counted in
* m_hashtable_.
* @param f writer the summary is printed to
*/
private static void summary(PrintWriter f)
{
    f.println("==================================================");
    f.println("Summary of differences");
    Enumeration methods = m_hashtable_.keys();
    while (methods.hasMoreElements())
    {
        String name = (String)methods.nextElement();
        int count = ((Integer)m_hashtable_.get(name)).intValue();
        StringBuffer line = new StringBuffer(name);
        // method name column is at least 30 characters wide
        while (line.length() < 30)
            line.append(' ');
        line.append(" ");
        line.append(count);
        f.println(line.toString());
    }
}
}

View file

@ -0,0 +1,664 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/test/text/Attic/UCharacterTest.java,v $
* $Date: 2000/12/26 20:01:08 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.test.text;
import java.io.BufferedReader;
import java.io.FileReader;
import com.ibm.test.TestFmwk;
import com.ibm.icu.text.UCharacter;
import com.ibm.icu.text.UCharacterCategoryEnum;
import com.ibm.icu.text.UCharacterDirectionEnum;
import com.ibm.icu.text.UTF16;
/**
* Testing class for UCharacter
* Mostly following the test cases for ICU
* @author Syn Wee Quek
* @since nov 04 2000
*/
public final class UCharacterTest extends TestFmwk
{
// private variables =============================================
/**
* Unicode version string that TestVersion expects
* UCharacter.getUnicodeVersion() to return
*/
private final String VERSION_ = "3.0.0.0";
// constructor ===================================================
/**
* Default constructor, no set up is required
*/
public UCharacterTest()
{
}
// public methods ================================================
/**
* Testing the case classification and the case mapping of UCharacter.
* Works on two parallel tables, entry k of the first being the upper or
* title case partner of entry k of the second. Characters without case
* are expected to map to themselves. Testing stops at the first failure.
*/
public void TestUpperLower()
{
    int uppercase[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0xb1, 0xb2,
                       0xb3, 0x48, 0x49, 0x4a, 0x2e, 0x3f, 0x3a, 0x4b, 0x4c,
                       0x4d, 0x4e, 0x4f, 0x01c4, 0x01c8, 0x000c, 0x0000};
    int lowercase[] = {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0xb1, 0x00b2,
                       0xb3, 0x68, 0x69, 0x6a, 0x2e, 0x3f, 0x3a, 0x6b, 0x6c,
                       0x6d, 0x6e, 0x6f, 0x01c6, 0x01c9, 0x000c, 0x0000};
    for (int k = 0; k < uppercase.length; k ++)
    {
        int up = uppercase[k];
        int low = lowercase[k];
        String uphex = Integer.toHexString(up);
        String lowhex = Integer.toHexString(low);

        // classification is only checked for letters
        if (UCharacter.isLetter(low) && !UCharacter.isLowerCase(low))
        {
            errln("FAIL isLowerCase test for 0x" + lowhex);
            return;
        }
        if (UCharacter.isLetter(up) && !UCharacter.isUpperCase(up)
            && !UCharacter.isTitleCase(up))
        {
            errln("FAIL isUpperCase test for 0x" + uphex);
            return;
        }
        // mapping between the two partners, title case counts as upper
        if (low != UCharacter.toLowerCase(up) ||
            (up != UCharacter.toUpperCase(low) &&
             up != UCharacter.toTitleCase(low)))
        {
            errln("FAIL case conversion test for 0x" + uphex + " to 0x" +
                  lowhex);
            return;
        }
        // mapping a character to its own case must not change it
        if (low != UCharacter.toLowerCase(low))
        {
            errln("FAIL lower case conversion test for 0x" + lowhex);
            return;
        }
        if (up != UCharacter.toUpperCase(up) &&
            up != UCharacter.toTitleCase(up))
        {
            errln("FAIL upper case conversion test for 0x" + uphex);
            return;
        }
        logln("Ok 0x" + uphex + " and 0x" + lowhex);
    }
}
/**
* Testing the letter and number determination in UCharacter.
* Uses the range 0x0041 to 0x005A (latin capital letters) as letters and the
* range 0x0660 to 0x0669 (arabic-indic digits) as digits. Every failing code
* point is reported, the loops do not stop at the first failure.
*/
public void TestLetterNumber()
{
    // letters have to be recognised as letters
    for (int i = 0x0041; i < 0x005B; i ++)
        if (!UCharacter.isLetter(i))
            errln("FAIL 0x" + Integer.toHexString(i) + " expected to be a letter");
    // digits are not letters
    for (int i = 0x0660; i < 0x066A; i ++)
        if (UCharacter.isLetter(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected not to be a letter");
    // digits have to be recognised as digits
    for (int i = 0x0660; i < 0x066A; i ++)
        if (!UCharacter.isDigit(i))
            errln("FAIL 0x" + Integer.toHexString(i) + " expected to be a digit");
    // both letters and digits satisfy isLetterOrDigit
    for (int i = 0x0041; i < 0x005B; i ++)
        if (!UCharacter.isLetterOrDigit(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected to be either a letter or a digit");
    for (int i = 0x0660; i < 0x066A; i ++)
        if (!UCharacter.isLetterOrDigit(i))
            errln("FAIL 0x" + Integer.toHexString(i) +
                  " expected to be either a letter or a digit");
}
/**
* Tests for space determination in UCharacter.
* isSpaceChar and isWhitespace are each checked against a table of
* characters expected to match and a table of characters expected not to
* match. The four tables have the same length and are walked in parallel.
* Testing stops at the first failure.
*/
public void TestSpaces()
{
    int spaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005};
    int nonspaces[] = {0x61, 0x62, 0x63, 0x64, 0x74};
    int whitespaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c};
    int nonwhitespaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f};
    int size = spaces.length;
    for (int i = 0; i < size; i ++)
    {
        if (!UCharacter.isSpaceChar(spaces[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(spaces[i]) +
                  " expected to be a space character");
            break;
        }
        if (UCharacter.isSpaceChar(nonspaces[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(nonspaces[i]) +
                  " expected not to be space character");
            break;
        }
        if (!UCharacter.isWhitespace(whitespaces[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(whitespaces[i]) +
                  " expected to be a white space character");
            break;
        }
        if (UCharacter.isWhitespace(nonwhitespaces[i]))
        {
            // this is the isWhitespace check, say so in the message
            errln("FAIL 0x" + Integer.toHexString(nonwhitespaces[i]) +
                  " expected not to be a white space character");
            break;
        }
        logln("Ok 0x" + Integer.toHexString(spaces[i]) + " and 0x" +
              Integer.toHexString(nonspaces[i]) + " and 0x" +
              Integer.toHexString(whitespaces[i]) + " and 0x" +
              Integer.toHexString(nonwhitespaces[i]));
    }
}
/**
* Tests UCharacter.isDefined with a table of code points expected to be
* undefined and a table of code points expected to be defined.
* Testing stops at the first failure.
*/
public void TestDefined()
{
    int unassigned[] = {0xfff1, 0xfff7, 0xfa30};
    int assigned[] = {0x523E, 0x4f88, 0xfffd};
    int k = 0;
    while (k < unassigned.length)
    {
        int missing = unassigned[k];
        if (UCharacter.isDefined(missing))
        {
            errln("FAIL 0x" + Integer.toHexString(missing) +
                  " expected not to be defined");
            return;
        }
        int present = assigned[k];
        if (!UCharacter.isDefined(present))
        {
            errln("FAIL 0x" + Integer.toHexString(present) +
                  " expected defined");
            return;
        }
        k ++;
    }
}
/**
* Tests UCharacter.isBaseForm with a table of base characters and a table
* of characters that are not base characters.
* Testing stops at the first failure.
*/
public void TestBase()
{
    int basechars[] = {0x0061, 0x0031, 0x03d2};
    int otherchars[] = {0x002B, 0x0020, 0x203B};
    int k = 0;
    while (k < basechars.length)
    {
        int other = otherchars[k];
        if (UCharacter.isBaseForm(other))
        {
            errln("FAIL 0x" + Integer.toHexString(other) +
                  " expected not to be a base character");
            return;
        }
        int basechar = basechars[k];
        if (!UCharacter.isBaseForm(basechar))
        {
            errln("FAIL 0x" + Integer.toHexString(basechar) +
                  " expected to be a base character");
            return;
        }
        k ++;
    }
}
/**
* Tests for digit characters.
* Checks the digit value of characters that are reported as digits and
* checks that a table of non digit characters is not reported as digits.
* Each of the three checks stops at its first failure.
*/
public void TestDigits()
{
    int digits[] = {0x0030, 0x0662, 0x0F23, 0x0ED5, 0x2160};
    //special characters not in the properties table
    int digits2[] = {0x3007, 0x4e00, 0x4e8c, 0x4e09, 0x56d8, 0x4e94, 0x516d,
                     0x4e03, 0x516b, 0x4e5d};
    int nondigits[] = {0x0010, 0x0041, 0x0122, 0x68FE};
    int digitvalues[] = {0, 2, 3, 5, 1};
    int digitvalues2[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    // the value is only compared when isDigit accepts the character, an
    // entry that is not reported as a digit passes without any check
    int size = digits.length;
    for (int i = 0; i < size; i ++)
        if (UCharacter.isDigit(digits[i]) &&
            UCharacter.digit(digits[i]) != digitvalues[i])
        {
            errln("FAIL 0x" + Integer.toHexString(digits[i]) +
                  " expected digit with value " + digitvalues[i]);
            break;
        }
    size = nondigits.length;
    for (int i = 0; i < size; i ++)
        if (UCharacter.isDigit(nondigits[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(nondigits[i]) +
                  " expected nondigit");
            break;
        }
    // bounded by the table length instead of a literal 10 so that the loop
    // follows the table when entries are added or removed
    size = digits2.length;
    for (int i = 0; i < size; i ++)
        if (UCharacter.isDigit(digits2[i]) &&
            UCharacter.digit(digits2[i]) != digitvalues2[i])
        {
            errln("FAIL 0x" + Integer.toHexString(digits2[i]) +
                  " expected digit with value " + digitvalues2[i]);
            break;
        }
}
/**
* Tests that UCharacter.getUnicodeVersion() returns the expected version
* VERSION_. A null version is reported as a failure instead of causing a
* NullPointerException, and the failure message shows the value received.
*/
public void TestVersion()
{
    String version = UCharacter.getUnicodeVersion();
    // comparison is done from the constant so that null is handled
    if (!VERSION_.equals(version))
        errln("FAIL expected " + VERSION_ + " but got " + version);
}
/**
* Tests UCharacter.isControl with a table of control characters and a table
* of characters that are not control characters.
* Testing stops at the first failure.
*/
public void TestControl()
{
    int controls[] = {0x001b, 0x0097, 0x0082};
    int others[] = {0x61, 0x0031, 0x00e2};
    for (int k = 0; k < controls.length; k ++)
    {
        String controlhex = Integer.toHexString(controls[k]);
        String otherhex = Integer.toHexString(others[k]);
        if (!UCharacter.isControl(controls[k]))
        {
            errln("FAIL 0x" + controlhex +
                  " expected to be a control character");
            return;
        }
        if (UCharacter.isControl(others[k]))
        {
            errln("FAIL 0x" + otherhex +
                  " expected to be not a control character");
            return;
        }
        logln("Ok 0x" + controlhex + " and 0x" + otherhex);
    }
}
/**
* Tests UCharacter.isPrintable with a table of printable characters and a
* table of characters that are not printable.
* Testing stops at the first failure.
*/
public void TestPrint()
{
    int visibles[] = {0x0042, 0x005f, 0x2014};
    int invisibles[] = {0x200c, 0x009f, 0x001b};
    for (int k = 0; k < visibles.length; k ++)
    {
        String visiblehex = Integer.toHexString(visibles[k]);
        String invisiblehex = Integer.toHexString(invisibles[k]);
        if (!UCharacter.isPrintable(visibles[k]))
        {
            errln("FAIL 0x" + visiblehex +
                  " expected to be a printable character");
            return;
        }
        if (UCharacter.isPrintable(invisibles[k]))
        {
            errln("FAIL 0x" + invisiblehex +
                  " expected not to be a printable character");
            return;
        }
        logln("Ok 0x" + visiblehex + " and 0x" + invisiblehex);
    }
}
/**
* Testing for identifier characters.
* isUnicodeIdentifierStart, isUnicodeIdentifierPart and
* isIdentifierIgnorable are each checked against a table of characters
* expected to match and a table of characters expected not to match. The
* six tables are walked in parallel and testing stops at the first failure.
*/
public void TestIdentifier()
{
    int starts[] = {0x0250, 0x00e2, 0x0061};
    int nonstarts[] = {0x2000, 0x000a, 0x2019};
    int parts[] = {0x005f, 0x0032, 0x0045};
    int nonparts[] = {0x2030, 0x00a3, 0x0020};
    int ignorables[] = {0x070F, 0x180B, 0x180C};
    int nonignorables[] = {0x0075, 0x00a3, 0x0061};
    for (int k = 0; k < starts.length; k ++)
    {
        String starthex = Integer.toHexString(starts[k]);
        String nonstarthex = Integer.toHexString(nonstarts[k]);
        String parthex = Integer.toHexString(parts[k]);
        String nonparthex = Integer.toHexString(nonparts[k]);
        String ignorablehex = Integer.toHexString(ignorables[k]);
        String nonignorablehex = Integer.toHexString(nonignorables[k]);

        if (!UCharacter.isUnicodeIdentifierStart(starts[k]))
        {
            errln("FAIL 0x" + starthex +
                  " expected to be a unicode identifier start character");
            return;
        }
        if (UCharacter.isUnicodeIdentifierStart(nonstarts[k]))
        {
            errln("FAIL 0x" + nonstarthex +
                  " expected not to be a unicode identifier start character");
            return;
        }
        if (!UCharacter.isUnicodeIdentifierPart(parts[k]))
        {
            errln("FAIL 0x" + parthex +
                  " expected to be a unicode identifier part character");
            return;
        }
        if (UCharacter.isUnicodeIdentifierPart(nonparts[k]))
        {
            errln("FAIL 0x" + nonparthex +
                  " expected not to be a unicode identifier part character");
            return;
        }
        if (!UCharacter.isIdentifierIgnorable(ignorables[k]))
        {
            errln("FAIL 0x" + ignorablehex +
                  " expected to be a ignorable unicode character");
            return;
        }
        if (UCharacter.isIdentifierIgnorable(nonignorables[k]))
        {
            errln("FAIL 0x" + nonignorablehex +
                  " expected not to be a ignorable unicode character");
            return;
        }
        logln("Ok 0x" + starthex + " and 0x" + nonstarthex + " and 0x" +
              parthex + " and 0x" + nonparthex + " and 0x" +
              ignorablehex + " and 0x" + nonignorablehex);
    }
}
/**
* Tests for the character types, direction.
* Reads the file UnicodeData-&lt;unicode version&gt;.txt, given as a relative
* path, line by line up to the entry for 0xFFFD and compares the general
* category and the bidirectional category of each entry with
* UCharacter.getType and UCharacter.getDirection. Comparison stops at the
* first mismatch. A data file that can not be read, or that ends before
* 0xFFFD, is reported as a failure. Finally the direction of the
* supplementary code point 0x10001 is checked.
*/
public void TestCatDir()
{
    // this is the 2 char category types used in the UnicodeData file,
    // position / 2 + 1 gives the UCharacterCategoryEnum value
    final String TYPE =
        "LuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf";
    // direction types used in the UnicodeData file, the array index is the
    // UCharacterDirectionEnum value. A table is used instead of a packed
    // string so that the lookup does not depend on the padding of a literal
    final String DIR[] = {"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
                          "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF",
                          "NSM", "BN"};
    final int LASTUNICODECHAR = 0xFFFD;
    String file = "UnicodeData-" + UCharacter.getUnicodeVersion() + ".txt";
    BufferedReader input = null;
    try
    {
        // reading in the UnicodeData file
        input = new BufferedReader(new FileReader(file));
        int ch = 0;
        while (ch != LASTUNICODECHAR)
        {
            String s = input.readLine();
            if (s == null)
            {
                errln("FAIL " + file + " ended before the entry for 0x" +
                      Integer.toHexString(LASTUNICODECHAR));
                break;
            }
            // getting the unicode character, its type and its direction
            // fields: code;name;category;combining class;direction;...
            ch = Integer.parseInt(s.substring(0, 4), 16);
            int index = s.indexOf(';', 5);
            String t = s.substring(index + 1, index + 3);
            index = s.indexOf(';', index + 4);
            String d = s.substring(index + 1, s.indexOf(';', index + 1));

            // testing the category
            // we override the general category of some control characters
            int type;
            if (ch == 9 || ch == 0xb || ch == 0x1f)
                type = UCharacterCategoryEnum.SPACE_SEPARATOR;
            else if (ch == 0xc)
                type = UCharacterCategoryEnum.LINE_SEPARATOR;
            else if (ch == 0xa || ch == 0xd || ch == 0x1c || ch == 0x1d ||
                     ch == 0x1e || ch == 0x85)
                type = UCharacterCategoryEnum.PARAGRAPH_SEPARATOR;
            else
            {
                type = TYPE.indexOf(t);
                if (type < 0)
                    type = 0;
                else
                    type = (type >> 1) + 1;
            }
            if (UCharacter.getType(ch) != type)
            {
                errln("FAIL 0x" + Integer.toHexString(ch) + " expected type " +
                      type);
                break;
            }

            // testing the direction, an unknown direction type stays -1
            // and is therefore reported as a mismatch
            int dir = -1;
            for (int k = 0; k < DIR.length; k ++)
                if (DIR[k].equals(d))
                {
                    dir = k;
                    break;
                }
            if (UCharacter.getDirection(ch) != dir)
            {
                errln("FAIL 0x" + Integer.toHexString(ch) +
                      " expected direction " + dir);
                break;
            }
        }
    }
    catch (Exception e)
    {
        // a missing or malformed data file must not let the test pass
        errln("FAIL reading " + file + " : " + e);
    }
    finally
    {
        if (input != null)
        {
            try
            {
                input.close();
            }
            catch (Exception e)
            {
                // the comparison is already done, a failure to close the
                // reader does not change the outcome of the test
            }
        }
    }
    if (UCharacter.getDirection(0x10001) !=
        UCharacterDirectionEnum.LEFT_TO_RIGHT)
        errln("FAIL 0x10001 expected direction " +
              UCharacterDirectionEnum.toString(UCharacterDirectionEnum.LEFT_TO_RIGHT));
}
/**
* Test for the character names.
* For a table of code points the modern name and the Unicode 1.0 name are
* compared with the expected names, and the names are mapped back to the
* code point. An empty entry in the 1.0 table means that no 1.0 name is
* expected. Afterwards every code point from UCharacter.MIN_VALUE up to,
* but not including, UCharacter.MAX_VALUE that has a name is mapped to its
* name and back. Each of the two parts stops at its first failure.
*/
public void TestNames()
{
    int c[] = {0x0061, 0x0284, 0x3401, 0x7fed, 0xac00, 0xd7a3, 0xff08, 0xffe5};
    String name[] = {"LATIN SMALL LETTER A",
                     "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
                     "CJK UNIFIED IDEOGRAPH-3401",
                     "CJK UNIFIED IDEOGRAPH-7FED", "HANGUL SYLLABLE GA",
                     "HANGUL SYLLABLE HIH", "FULLWIDTH LEFT PARENTHESIS",
                     "FULLWIDTH YEN SIGN"};
    String oldname[] = {"", "LATIN SMALL LETTER DOTLESS J BAR HOOK", "", "",
                        "", "", "FULLWIDTH OPENING PARENTHESIS", ""};
    int size = c.length;
    String str;
    int uc;
    for (int i = 0; i < size; i ++)
    {
        // modern Unicode character name, a missing name is a failure and
        // not a NullPointerException
        str = UCharacter.getName(c[i]);
        if (str == null || !str.equalsIgnoreCase(name[i]))
        {
            errln("FAIL 0x" + Integer.toHexString(c[i]) + " expected name " +
                  name[i]);
            break;
        }
        // 1.0 Unicode character name
        str = UCharacter.getName1_0(c[i]);
        if ((str == null && oldname[i].length() > 0) ||
            (str != null && !str.equalsIgnoreCase(oldname[i])))
        {
            errln("FAIL 0x" + Integer.toHexString(c[i]) + " expected 1.0 name " +
                  oldname[i]);
            break;
        }
        // retrieving unicode character from modern name
        uc = UCharacter.getCharFromName(name[i]);
        if (uc != c[i])
        {
            errln("FAIL " + name[i] + " expected character 0x" +
                  Integer.toHexString(c[i]));
            break;
        }
        // retrieving unicode character from 1.0 name, only possible for
        // the entries that have a 1.0 name
        if (oldname[i].length() > 0)
        {
            uc = UCharacter.getCharFromName1_0(oldname[i]);
            if (uc != c[i])
            {
                errln("FAIL " + oldname[i] + " expected 1.0 character 0x" +
                      Integer.toHexString(c[i]));
                break;
            }
        }
    }
    // extra testing different from icu
    for (int i = UCharacter.MIN_VALUE; i < UCharacter.MAX_VALUE; i ++)
    {
        str = UCharacter.getName(i);
        if (str != null && UCharacter.getCharFromName(str) != i)
        {
            errln("FAIL 0x" + Integer.toHexString(i) + " " + str +
                  " retrieval of name and vice versa" );
            break;
        }
    }
}
/**
* Testing UTF16 class methods append, getCharCount and bounds.
* Every code point from UCharacter.MIN_VALUE up to, but not including,
* UCharacter.MAX_VALUE is appended to a fixed prefix string. The number of
* chars added, the char count reported for the code point and the
* boundary type found at the appended chars are checked.
* Testing stops at the first failure.
*/
public void TestUTF16AppendBoundCount()
{
    StringBuffer str = new StringBuffer("this is a string ");
    final int length = str.length();
    for (int i = UCharacter.MIN_VALUE; i < UCharacter.MAX_VALUE; i ++)
    {
        // start again from the prefix for every code point, otherwise the
        // buffer keeps growing and each toString() below has to deal with
        // everything appended so far
        str.setLength(length);
        UTF16.append(str, i);
        if (!UCharacter.isSupplementary(i))
        {
            if (UTF16.getCharCount(i) != 1)
            {
                errln("FAIL Counting BMP character size error" );
                break;
            }
            if (str.length() != length + 1)
            {
                errln("FAIL Adding a BMP character error" );
                break;
            }
            // a lone surrogate is not expected to be a single char boundary
            if (!UTF16.isSurrogate((char)i) &&
                UTF16.bounds(str.toString(), str.length() - 1) !=
                UTF16.SINGLE_CHAR_BOUNDARY)
            {
                errln("FAIL Finding BMP character bounds error" );
                break;
            }
        }
        else
        {
            if (UTF16.getCharCount(i) != 2)
            {
                errln("FAIL Counting Supplementary character size error" );
                break;
            }
            if (str.length() != length + 2)
            {
                errln("FAIL Adding a Supplementary character error" );
                break;
            }
            String appended = str.toString();
            if (UTF16.bounds(appended, appended.length() - 2) !=
                UTF16.LEAD_SURROGATE_BOUNDARY ||
                UTF16.bounds(appended, appended.length() - 1) !=
                UTF16.TRAIL_SURROGATE_BOUNDARY)
            {
                errln("FAIL Finding Supplementary character bounds error" );
                break;
            }
        }
    }
}
/**
* Testing UTF16 class methods findCPOffset, findOffsetFromCP and charAt.
* The test string is "12345", the supplementary code point 0x10001,
* "67890" and the supplementary code point 0x10002. Each of the three
* methods is reported once if any of its checks fails.
*/
public void TestUTF16OffsetCharAt()
{
    StringBuffer buffer = new StringBuffer("12345");
    UTF16.append(buffer, 0x10001);
    buffer.append("67890");
    UTF16.append(buffer, 0x10002);
    String text = buffer.toString();

    // both chars of a surrogate pair give the supplementary code point
    boolean charatok = UTF16.charAt(text, 0) == '1' &&
                       UTF16.charAt(text, 2) == '3' &&
                       UTF16.charAt(text, 5) == 0x10001 &&
                       UTF16.charAt(text, 6) == 0x10001 &&
                       UTF16.charAt(text, 12) == 0x10002 &&
                       UTF16.charAt(text, 13) == 0x10002;
    if (!charatok)
        errln("FAIL Getting character from string error" );

    boolean cpoffsetok = UTF16.findCPOffset(text, 3) == 3 &&
                         UTF16.findCPOffset(text, 5) == 5 &&
                         UTF16.findCPOffset(text, 6) == 6;
    if (!cpoffsetok)
        errln("FAIL Getting codepoint offset from string error" );

    boolean utf16offsetok = UTF16.findOffsetFromCP(text, 3) == 3 &&
                            UTF16.findOffsetFromCP(text, 5) == 5 &&
                            UTF16.findOffsetFromCP(text, 6) == 7;
    if (!utf16offsetok)
        errln("FAIL Getting UTF16 offset from codepoint in string error" );
}
/**
* Runs the tests of this class through the run method, passing on the
* command line arguments. Any exception is printed with its stack trace.
* @param arg command line arguments
*/
public static void main(String[] arg)
{
    try
    {
        new UCharacterTest().run(arg);
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,248 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterCategoryEnum.java $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* Enumerated Unicode category types from the UnicodeData.txt file.
* Used as return results from <a href=UCharacter.html>UCharacter</a>
* Equivalent to icu's UCharCategory.
* Refer to <a href=http://www.unicode.org/Public/UNIDATA/UnicodeData.html>
* Unicode Consortium</a> for more information about UnicodeData.txt.
* @author Syn Wee Quek
* @since oct0300
*/
public final class UCharacterCategoryEnum
{
// private constructor ===================================================
/**
* Private constructor to prevent instantiation, this class only provides
* static constants and a static method
*/
private UCharacterCategoryEnum()
{
}
// public variable =======================================================
/**
* Unassigned character type
*/
public static final int UNASSIGNED = 0;
/**
* Character type Lu
*/
public static final int UPPERCASE_LETTER = UNASSIGNED + 1;
/**
* Character type Ll
*/
public static final int LOWERCASE_LETTER = UPPERCASE_LETTER + 1;
/**
* Character type Lt
*/
public static final int TITLECASE_LETTER = LOWERCASE_LETTER + 1;
/**
* Character type Lm
*/
public static final int MODIFIER_LETTER = TITLECASE_LETTER + 1;
/**
* Character type Lo
*/
public static final int OTHER_LETTER = MODIFIER_LETTER + 1;
/**
* Character type Mn
*/
public static final int NON_SPACING_MARK = OTHER_LETTER + 1;
/**
* Character type Me
*/
public static final int ENCLOSING_MARK = NON_SPACING_MARK + 1;
/**
* Character type Mc
*/
public static final int COMBINING_SPACING_MARK = ENCLOSING_MARK + 1;
/**
* Character type Nd
*/
public static final int DECIMAL_DIGIT_NUMBER = COMBINING_SPACING_MARK + 1;
/**
* Character type Nl
*/
public static final int LETTER_NUMBER = DECIMAL_DIGIT_NUMBER + 1;
// start of 11------------
/**
* Character type No
*/
public static final int OTHER_NUMBER = LETTER_NUMBER + 1;
/**
* Character type Zs
*/
public static final int SPACE_SEPARATOR = OTHER_NUMBER + 1;
/**
* Character type Zl
*/
public static final int LINE_SEPARATOR = SPACE_SEPARATOR + 1;
/**
* Character type Zp
*/
public static final int PARAGRAPH_SEPARATOR = LINE_SEPARATOR + 1;
/**
* Character type Cc
*/
public static final int CONTROL = PARAGRAPH_SEPARATOR + 1;
/**
* Character type Cf
*/
public static final int FORMAT = CONTROL + 1;
/**
* Character type Co
*/
public static final int PRIVATE_USE = FORMAT + 1;
/**
* Character type Cs
*/
public static final int SURROGATE = PRIVATE_USE + 1;
/**
* Character type Pd
*/
public static final int DASH_PUNCTUATION = SURROGATE + 1;
/**
* Character type Ps
*/
public static final int START_PUNCTUATION = DASH_PUNCTUATION + 1;
// start of 21 ------------
/**
* Character type Pe
*/
public static final int END_PUNCTUATION = START_PUNCTUATION + 1;
/**
* Character type Pc
*/
public static final int CONNECTOR_PUNCTUATION = END_PUNCTUATION + 1;
/**
* Character type Po
*/
public static final int OTHER_PUNCTUATION = CONNECTOR_PUNCTUATION + 1;
/**
* Character type Sm
*/
public static final int MATH_SYMBOL = OTHER_PUNCTUATION + 1;
/**
* Character type Sc
*/
public static final int CURRENCY_SYMBOL = MATH_SYMBOL + 1;
/**
* Character type Sk
*/
public static final int MODIFIER_SYMBOL = CURRENCY_SYMBOL + 1;
/**
* Character type So
*/
public static final int OTHER_SYMBOL = MODIFIER_SYMBOL + 1;
/**
* Character type Pi
*/
public static final int INITIAL_PUNCTUATION = OTHER_SYMBOL + 1;
/**
* Character type Pf
*/
public static final int FINAL_PUNCTUATION = INITIAL_PUNCTUATION + 1;
/**
* Character type Cn
*/
public static final int GENERAL_OTHER_TYPES = FINAL_PUNCTUATION + 1;
// start of 31 ------------
/**
* Character type count
*/
public static final int CHAR_CATEGORY_COUNT = GENERAL_OTHER_TYPES + 1;
/**
* Gets the name of the argument category
* @param category to retrieve name
* @return category name, "Unassigned" for any value outside the range
*         UPPERCASE_LETTER to FINAL_PUNCTUATION
*/
public static String toString(int category)
{
    // names in the order of the category constants, which are consecutive
    // values starting with UPPERCASE_LETTER and ending with
    // FINAL_PUNCTUATION
    final String NAMES[] = {
        "Letter, Uppercase",
        "Letter, Lowercase",
        "Letter, Titlecase",
        "Letter, Modifier",
        "Letter, Other",
        "Mark, Non-Spacing",
        "Mark, Enclosing",
        "Mark, Spacing Combining",
        "Number, Decimal Digit",
        "Number, Letter",
        "Number, Other",
        "Separator, Space",
        "Separator, Line",
        "Separator, Paragraph",
        "Other, Control",
        "Other, Format",
        "Other, Private Use",
        "Other, Surrogate",
        "Punctuation, Dash",
        "Punctuation, Open",
        "Punctuation, Close",
        "Punctuation, Connector",
        "Punctuation, Other",
        "Symbol, Math",
        "Symbol, Currency",
        "Symbol, Modifier",
        "Symbol, Other",
        "Punctuation, Initial quote ",
        "Punctuation, Final quote "
    };
    if (category < UPPERCASE_LETTER || category > FINAL_PUNCTUATION)
        return "Unassigned";
    return NAMES[category - UPPERCASE_LETTER];
}
}

View file

@ -0,0 +1,99 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UCharacterDB.java,v $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* Internal base class for all character databases.
* Database classes hold the binary data read from uprops.dat and unames.
* They do not interpret that data, they only hand out the raw information
* when asked for it; arrays of char are used for storage since that is the
* form most commonly needed for retrieval.
* Turning the binary data into something more meaningful is the task of
* <a href=UCharacterPpty.html>UCharacterPpty</a> and
* <a href=UCharacterName.html>UCharacterName</a>.
* Data populated by <a href=UGenReader.html>UGenReader</a>
* @author Syn Wee Quek
* @since oct3100 HALLOWEEN!!
* @see com.ibm.icu.text.UCharacterPpty
* @see com.ibm.icu.text.UCharacterName
*/
class UCharacterDB
{
    // protected variable ===========================================

    /**
    * Unicode data version in dotted form, stays null until
    * setUnicodeVersion has accepted a version with a non zero component
    */
    String m_unicodeversion_;

    // constructor =============================================

    /**
    * Constructor for UCharacterDB
    */
    protected UCharacterDB()
    {
    }

    // public method =============================================

    /**
    * toString method for printing
    * @return a line break followed by the unicode version number
    */
    public String toString()
    {
        return "\nunicode version number " + m_unicodeversion_;
    }

    // protected method =============================================

    /**
    * Set version number for this set of unicode characters.
    * Every component has to be a single digit 0 to 9. The stored version is
    * the components joined by '.', it is only replaced when at least one
    * component is not zero. A null or empty argument, or one made of zeros
    * only, leaves the stored version untouched and still returns true.
    * @param version components of the version number, may be null
    * @return false if a component is outside the range 0 to 9, true
    *         otherwise
    */
    protected boolean setUnicodeVersion(byte[] version)
    {
        int count = (version == null) ? 0 : version.length;
        StringBuffer dotted = new StringBuffer();
        boolean nonzero = false;
        for (int k = 0; k < count; k ++)
        {
            byte part = version[k];
            if (part < 0 || part > 9)
                return false;
            if (k > 0)
                dotted.append('.');
            dotted.append((int)part);
            if (part != 0)
                nonzero = true;
        }
        if (nonzero)
            m_unicodeversion_ = dotted.toString();
        return true;
    }
}

View file

@ -0,0 +1,182 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterDirectionEnum.java $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* Enumerated Unicode character linguistic direction constants.
* Used as return results from <a href=UCharacter.html>UCharacter</a>
* @author Syn Wee Quek
* @since oct0300
*/
public final class UCharacterDirectionEnum
{
// private constructor =========================================
/**
* Private constructor to prevent instantiation, the members visible in this
* class are static
*/
private UCharacterDirectionEnum()
{
}
// public variable =============================================
/**
* Directional type L
*/
public static final int LEFT_TO_RIGHT = 0;
/**
* Directional type R
*/
public static final int RIGHT_TO_LEFT = LEFT_TO_RIGHT + 1;
/**
* Directional type EN
*/
public static final int EUROPEAN_NUMBER = RIGHT_TO_LEFT + 1;
/**
* Directional type ES
*/
public static final int EUROPEAN_NUMBER_SEPARATOR = EUROPEAN_NUMBER + 1;
/**
* Directional type ET
*/
public static final int EUROPEAN_NUMBER_TERMINATOR =
EUROPEAN_NUMBER_SEPARATOR + 1;
/**
* Directional type AN
*/
public static final int ARABIC_NUMBER =
EUROPEAN_NUMBER_TERMINATOR + 1;
/**
* Directional type CS
*/
public static final int COMMON_NUMBER_SEPARATOR = ARABIC_NUMBER + 1;
/**
* Directional type B
*/
public static final int BLOCK_SEPARATOR =
COMMON_NUMBER_SEPARATOR + 1;
/**
* Directional type S
*/
public static final int SEGMENT_SEPARATOR = BLOCK_SEPARATOR + 1;
/**
* Directional type WS
*/
public static final int WHITE_SPACE_NEUTRAL = SEGMENT_SEPARATOR + 1;
// start of 11 ---------------
/**
* Directional type ON
*/
public static final int OTHER_NEUTRAL =
WHITE_SPACE_NEUTRAL + 1;
/**
* Directional type LRE
*/
public static final int LEFT_TO_RIGHT_EMBEDDING = OTHER_NEUTRAL + 1;
/**
* Directional type LRO
*/
public static final int LEFT_TO_RIGHT_OVERRIDE =
LEFT_TO_RIGHT_EMBEDDING + 1;
/**
* Directional type AL
*/
public static final int RIGHT_TO_LEFT_ARABIC =
LEFT_TO_RIGHT_OVERRIDE + 1;
/**
* Directional type RLE
*/
public static final int RIGHT_TO_LEFT_EMBEDDING =
RIGHT_TO_LEFT_ARABIC + 1;
/**
* Directional type RLO
*/
public static final int RIGHT_TO_LEFT_OVERRIDE =
RIGHT_TO_LEFT_EMBEDDING + 1;
/**
* Directional type PDF
*/
public static final int POP_DIRECTIONAL_FORMAT =
RIGHT_TO_LEFT_OVERRIDE + 1;
/**
* Directional type NSM
*/
public static final int DIR_NON_SPACING_MARK =
POP_DIRECTIONAL_FORMAT + 1;
/**
* Directional type BN
*/
public static final int BOUNDARY_NEUTRAL =
DIR_NON_SPACING_MARK + 1;
/**
* Number of directional type
*/
public static final int CHAR_DIRECTION_COUNT = BOUNDARY_NEUTRAL + 1;
/**
* Gets the name of the argument direction
* @param dir direction type to retrieve name
* @return directional name
*/
public static String toString(int dir)
{
switch(dir)
{
case LEFT_TO_RIGHT :
return "Left-to-Right";
case RIGHT_TO_LEFT :
return "Right-to-Left";
case EUROPEAN_NUMBER :
return "European Number";
case EUROPEAN_NUMBER_SEPARATOR :
return "European Number Separator";
case EUROPEAN_NUMBER_TERMINATOR :
return "European Number Terminator";
case ARABIC_NUMBER :
return "Arabic Number";
case COMMON_NUMBER_SEPARATOR :
return "Common Number Separator";
case BLOCK_SEPARATOR :
return "Paragraph Separator";
case SEGMENT_SEPARATOR :
return "Segment Separator";
case WHITE_SPACE_NEUTRAL :
return "Whitespace";
case OTHER_NEUTRAL :
return "Other Neutrals";
case LEFT_TO_RIGHT_EMBEDDING :
return "Left-to-Right Embedding";
case LEFT_TO_RIGHT_OVERRIDE :
return "Left-to-Right Override";
case RIGHT_TO_LEFT_ARABIC :
return "Right-to-Left Arabic";
case RIGHT_TO_LEFT_EMBEDDING :
return "Right-to-Left Embedding";
case RIGHT_TO_LEFT_OVERRIDE :
return "Right-to-Left Override";
case POP_DIRECTIONAL_FORMAT :
return "Pop Directional Format";
case DIR_NON_SPACING_MARK :
return "Non-Spacing Mark";
case BOUNDARY_NEUTRAL :
return "Boundary Neutral";
}
return "Unassigned";
}
}

View file

@ -0,0 +1,179 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* Internal class to manage character names.
* <a href=UCharacterNameDB.html>UCharacterNameDB</a> provides the data
* required and UCharacterName parses it into meaningful results before
* returning a value.
* Since data in <a href=UCharacterNameDB.html>UCharacterNameDB</a> is stored
* in an array of char, by default indexes used in this class refer to
* a 2 byte count, unless otherwise stated. In cases where the index refers
* to a byte count, the index is halved and depending on whether the index is
* even or odd, the MSB or LSB of the result char at the halved index is
* returned. For indexes to an array of int, the index is multiplied by 2,
* and the char at the multiplied index and its following char are returned
* as an int.
* <a href=UCharacter.html>UCharacter</a> acts as a public facade for this class
* Note : 0 - 0x1F are control characters without names in Unicode 3.0
* Information on parsing of the binary data in
* <a href=UCharacterNameDB.html>UCharacterNameDB</a> is located at
* <a href=oss.software.ibm.com/icu4j/icu4jhtml/com/ibm/icu/text/readme.html>
* ReadMe</a>
* @author Syn Wee Quek
* @since nov0700
*/
final class UCharacterName
{
  // private variable =============================================

  /**
  * Database storing the sets of character name
  */
  private static final UCharacterNameDB NAME_DB_;

  // block to initialise name database and unicode 1.0 data indicator
  static
  {
    try
    {
      NAME_DB_ = new UCharacterNameDB();
    }
    catch (Exception e)
    {
      // a failure to load the name data is unrecoverable for this class,
      // so it is surfaced as an unchecked exception at class initialisation
      throw new RuntimeException(e.getMessage());
    }
  }

  // protected method =============================================

  /**
  * Retrieve the name of a Unicode code point.
  * Depending on <code>choice</code>, the character name returned is the
  * "modern" name or the name that was defined in Unicode version 1.0.
  * The name contains only "invariant" characters
  * like A-Z, 0-9, space, and '-'.
  * NOTE(review): the upper bound accepted here is 0x1ffff; confirm whether
  * the full Unicode range up to 0x10ffff was intended.
  *
  * @param ch the code point for which to get the name.
  * @param choice Selector for which name to get, one of the
  *        UCharacterNameChoiceEnum constants.
  * @return the name; null if ch is below 0 or above 0x1ffff, if choice is
  *         not a valid selector, or if the database holds no such name
  */
  protected static String getName(int ch, int choice)
  {
    if (ch < 0 || ch > 0x1ffff || !isValidChoice(choice))
      return null;

    String result = "";

    // Do not write algorithmic Unicode 1.0 names because Unihan names are
    // the same as the modern ones, extension A was only introduced with
    // Unicode 3.0, and the Hangul syllable block was moved and changed around
    // Unicode 1.1.5.
    if (choice == UCharacterNameChoiceEnum.U_UNICODE_CHAR_NAME)
      // try getting algorithmic name first
      result = getAlgName(ch);

    // getting normal character name
    if (result == null || result.length() == 0)
      result = NAME_DB_.getGroupName(ch, choice);

    return result;
  }

  /**
  * Find a character by its name and return its code point value
  * @param choice selector to indicate if argument name is a Unicode 1.0
  *        or the most current version
  * @param name character name
  * @return code point, or -1 if choice is not a valid selector, name is
  *         null or empty, or no character carries that name
  */
  protected static int getCharFromName(int choice, String name)
  {
    // checks for illegal arguments
    if (!isValidChoice(choice) || name == null || name.length() == 0)
      return -1;

    // try algorithmic names first, if fails then try group names
    int result = getAlgorithmChar(choice, name);
    if (result >= 0)
      return result;
    return getGroupChar(name, choice);
  }

  // private method =============================================

  /**
  * Checks that the argument is one of the name choice selectors.
  * Negative values are rejected as well as values at or above the count;
  * without the lower bound a negative selector would be treated as a
  * request for a Unicode 1.0 name.
  * @param choice selector to check
  * @return true if choice lies in [0, U_CHAR_NAME_CHOICE_COUNT)
  */
  private static boolean isValidChoice(int choice)
  {
    return choice >= 0 &&
           choice < UCharacterNameChoiceEnum.U_CHAR_NAME_CHOICE_COUNT;
  }

  /**
  * Gets the algorithmic name for the argument character
  * @param ch character to determine name for
  * @return the algorithmic name or null if not found
  */
  private static String getAlgName(int ch)
  {
    int index = NAME_DB_.getAlgorithmIndex(ch);
    if (index < 0)
      return null;

    StringBuffer s = new StringBuffer();
    NAME_DB_.appendAlgorithmName(index, ch, s);
    return s.toString();
  }

  /**
  * Gets the character for the argument algorithmic name
  * @param choice of either 1.0 or the most current unicode name
  * @param name algorithmic name to look up
  * @return the algorithmic char or -1 otherwise.
  */
  private static int getAlgorithmChar(int choice, String name)
  {
    // 1.0 has no algorithmic names
    if (choice != UCharacterNameChoiceEnum.U_UNICODE_CHAR_NAME)
      return -1;

    // ranges are probed from the last one to the first
    for (int count = NAME_DB_.countAlgorithm() - 1; count >= 0; count --)
    {
      int result = NAME_DB_.getAlgorithmChar(count, name);
      if (result >= 0)
        return result;
    }
    return -1;
  }

  /**
  * Getting the character with the tokenized argument name
  * @param name of the character
  * @param choice of either 1.0 or the most current unicode name
  * @return character with the tokenized argument name or -1 if character is
  *         not found
  */
  private static int getGroupChar(String name, int choice)
  {
    int groupcount = NAME_DB_.countGroup();
    for (int i = 0; i < groupcount; i ++)
    {
      int result = NAME_DB_.getGroupChar(i, name, choice);
      if (result != -1)
        return result;
    }
    return -1;
  }
}

View file

@ -0,0 +1,34 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterNameChoiceEnum.java $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* Internal holder of the selector constants used to pick which kind of
* Unicode character name is wanted: the "modern" name, or the name that was
* defined in Unicode version 1.0, before the Unicode standard merged with
* ISO-10646.
* The constants are passed as arguments to
* <a href=UCharacterName.html>UCharacterName</a>
* @author Syn Wee Quek
* @since oct0600
*/
interface UCharacterNameChoiceEnum
{
  // public variables =============================================

  /**
  * Selects the modern Unicode character name
  */
  int U_UNICODE_CHAR_NAME = 0;
  /**
  * Selects the Unicode 1.0 character name
  */
  int U_UNICODE_10_CHAR_NAME = U_UNICODE_CHAR_NAME + 1;
  /**
  * Number of selectors; valid selectors are strictly below this value
  */
  int U_CHAR_NAME_CHOICE_COUNT = U_UNICODE_10_CHAR_NAME + 1;
}

View file

@ -0,0 +1,877 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UCharacterNameDB.java,v $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
/**
* Internal class used for the Unicode character name database.
* Database classes store binary data read from uprops.dat and unames.dat for
* use. They do not have the capability to parse the data into higher-level
* information; they only return bytes of information when required.
* Due to the form most commonly used for retrieval, an array of char is used
* to store the binary data.
* UCharacterNameDB also contains indexes to significant points in the binary
* data.
* Responsibility for molding the binary data into a more meaningful form lies
* with <a href=UCharacterPpty.html>UCharacterPpty</a> and
* <a href=UCharacterName.html>UCharacterName</a>.
* Data is populated by <a href=UGenNameReader.html>UGenNameReader</a>.
* @author Syn Wee Quek
* @since oct2700
* @see com.ibm.icu.text.UGenReader
*/
final class UCharacterNameDB extends UCharacterDB
{
// private variable =============================================
/**
* Data used in unames.dat
*/
// Token table indexed by a name byte value. An entry of 0xFFFE marks the
// byte as the lead byte of a double-byte token, 0xFFFF marks the byte as a
// literal character, any other value is an index into m_tokenstring_.
private char m_tokentable_[];
// Null terminated token words, addressed by the values in m_tokentable_
private byte m_tokenstring_[];
// Group records, m_groupsize_ chars each: element 0 is the group's most
// significant bits, elements OFFSET_HIGH_OFFSET_ and OFFSET_LOW_OFFSET_
// combine into the byte offset of the group's data in m_groupstring_
private char m_groupinfo_[];
// Per group: nibble-encoded lengths of the group's names followed by the
// tokenized name bytes themselves
private byte m_groupstring_[];
// Ranges of code points whose names are computed rather than stored
private AlgorithmName m_algorithm_[];
/**
* Number of group sets
*/
private int m_groupcount_ = 0;
// Size of one group record in m_groupinfo_, in number of char
private int m_groupsize_ = 0;
/**
* Default name of the name datafile
*/
private static final String NAME_FILE_NAME_ = "unames.dat";
/**
* Default buffer size of datafile
*/
private static final int NAME_BUFFER_SIZE_ = 100000;
/**
* Shift count to retrieve group information
*/
private static final int GROUP_SHIFT_ = 5;
/**
* Number of lines per group
*/
private static final int LINES_PER_GROUP_ = 1 << GROUP_SHIFT_;
/**
* Mask to retrieve the offset for a particular character within a group
*/
private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1;
/**
* Position of offsethigh in group information array
*/
private static final int OFFSET_HIGH_OFFSET_ = 1;
/**
* Position of offsetlow in group information array
*/
private static final int OFFSET_LOW_OFFSET_ = 2;
/**
* Indicator of whether Unicode 1.0 names are available.
* Static but not final: it is assigned by the constructor once the token
* table has been read.
*/
private static boolean UNICODE_1_;
/**
* Double nibble indicator, any nibble > this number has to be combined
* with its following nibble
*/
private static final int SINGLE_NIBBLE_MAX_ = 11;
// constructor ====================================================
/**
* Protected constructor. Loads the name data from the resource
* "unames.dat" located relative to this class and records whether
* Unicode 1.0 names are present in it.
* The data stream is always closed before the constructor returns, whether
* reading succeeded or not.
* @exception Exception thrown when the data file can not be located, when
*            data reading fails or when data has been corrupted
*/
protected UCharacterNameDB() throws Exception
{
  InputStream i = getClass().getResourceAsStream(NAME_FILE_NAME_);
  if (i == null)
    throw new Exception("Unable to locate " + NAME_FILE_NAME_);

  DataInputStream d = new DataInputStream(
                          new BufferedInputStream(i, NAME_BUFFER_SIZE_));
  try
  {
    UGenNameReader reader = new UGenNameReader();
    if (!reader.read(d, this))
      throw new Exception("Data corrupted in " + NAME_FILE_NAME_);
  }
  finally
  {
    // release the stream on the failure paths too
    d.close();
  }

  // Unicode 1.0 names follow a ';' inside each stored name. They are only
  // present when ';' is a literal character, i.e. it either lies outside
  // the token table or its table entry is the "not a token" marker 0xFFFF.
  UNICODE_1_ = (';' >= m_tokentable_.length) ||
               (m_tokentable_[(int)';'] == 0xFFFF);
}
// public method ==================================================
/**
* Produces the printable description of this database. At present only a
* fixed header line is generated.
* @return the text "names content " followed by a line feed
*/
public String toString()
{
  StringBuffer description = new StringBuffer();
  description.append("names content \n");
  return description.toString();
}
// protected methods ===============================================
/**
* Stores the token table and the token word data.
* Nothing is stored unless both arrays are non-null and non-empty.
* @param token array of tokens
* @param tokenstring array of string values of the tokens
* @return true if the data was stored, false if there is a data error
*/
protected boolean setToken(char token[], byte tokenstring[])
{
  if (token == null || tokenstring == null)
    return false;
  if (token.length == 0 || tokenstring.length == 0)
    return false;

  m_tokentable_ = token;
  m_tokenstring_ = tokenstring;
  return true;
}
/**
* Stores the number of groups and the size of each group record.
* Both values have to be strictly positive, otherwise nothing is stored.
* @param count number of groups
* @param size size of group in char
* @return true if count and size were accepted and stored
*/
protected boolean setGroupCountSize(int count, int size)
{
  boolean valid = count > 0 && size > 0;
  if (valid)
  {
    m_groupcount_ = count;
    m_groupsize_ = size;
  }
  return valid;
}
/**
* Stores the group index records and the group name data.
* Nothing is stored unless both arrays are non-null and non-empty.
* @param group index information array
* @param groupstring name information array
* @return true if the data was stored, false if there is a data error
*/
protected boolean setGroup(char group[], byte groupstring[])
{
  if (group == null || groupstring == null)
    return false;
  if (group.length == 0 || groupstring.length == 0)
    return false;

  m_groupinfo_ = group;
  m_groupstring_ = groupstring;
  return true;
}
/**
* Binary search for the group whose most significant bits equal those of
* the argument code point, i.e. the code point shifted right by the group
* shift.
* @param ch the code point to look for
* @return the offset of the group's string data, combined from the high and
*         low offset entries of the group record, or -1 if no group matches
*/
protected int getGroupStringIndex(int ch)
{
  int target = ch >> GROUP_SHIFT_;
  int low = 0;
  int high = m_groupcount_;

  // narrow [low, high) down to a single candidate group
  while (low < high - 1)
  {
    int middle = (low + high) >> 1;
    if (target < getGroupMSB(middle))
      high = middle;
    else
      low = middle;
  }

  // the candidate only counts if it is an exact match
  if (target != getGroupMSB(low))
    return -1;

  int record = low * m_groupsize_;
  return UCharacterUtil.toInt(m_groupinfo_[record + OFFSET_HIGH_OFFSET_],
                              m_groupinfo_[record + OFFSET_LOW_OFFSET_]);
}
/**
* Gets how many group records this database holds
* @return number of groups, as stored by setGroupCountSize
*/
protected int countGroup()
{
  final int groups = m_groupcount_;
  return groups;
}
/**
* Gets the stored (non-algorithmic) name of the character
* @param ch character to get the name for
* @param choice name choice selector to choose a unicode 1.0 or newer name
* @return name of the character, or null if a Unicode 1.0 name is requested
*         but none are stored, or if no group contains the character
*/
protected String getGroupName(int ch, int choice)
{
  boolean modern = choice == UCharacterNameChoiceEnum.U_UNICODE_CHAR_NAME;
  if (!modern && !UNICODE_1_)
    // the semicolon byte value is a token number rather than a literal
    // character, so only modern names are stored in unames.dat and the
    // requested Unicode 1.0 name does not exist
    return null;

  int target = ch >> GROUP_SHIFT_;
  int low = 0;
  int high = m_groupcount_;

  // binary search for the group of names that contains the one for ch
  while (low < high - 1)
  {
    int middle = (low + high) >> 1;
    if (target < getGroupMSB(middle))
      high = middle;
    else
      low = middle;
  }

  // only an exact match holds the name
  if (target != getGroupMSB(low))
    return null;

  char offsets[] = new char[LINES_PER_GROUP_ + 1];
  char lengths[] = new char[LINES_PER_GROUP_ + 1];
  int namestart = getGroupLengths(low, offsets, lengths);
  int line = ch & GROUP_MASK_;
  return getGroupName(namestart + offsets[line], lengths[line], choice);
}
/**
* Looks for the character with the argument name inside one group
* @param index of the group to check
* @param name of the character
* @param choice of Unicode version used
* @return code point of the character carrying the name, formed from the
*         group's most significant bits and the position of the name within
*         the group, or -1 if the name is not found in the group
*/
protected int getGroupChar(int index, String name, int choice)
{
  boolean modern = choice == UCharacterNameChoiceEnum.U_UNICODE_CHAR_NAME;
  if (!modern && !UNICODE_1_)
    // semicolon byte value is a token number, therefore only modern
    // names are stored in unames.dat and there is no such requested
    // Unicode 1.0 name here
    return -1;

  // expand the length table of the group
  char offsets[] = new char[LINES_PER_GROUP_ + 1];
  char lengths[] = new char[LINES_PER_GROUP_ + 1];
  int namestart = getGroupLengths(index, offsets, lengths);

  // the actual comparison is done per line of the group
  int line = getGroupChar(namestart, lengths, name, choice);
  if (line == -1)
    return -1;
  return (getGroupMSB(index) << GROUP_SHIFT_) | line;
}
/**
* Stores the algorithmic name range array.
* A null or empty array is rejected and leaves the database unchanged.
* @param alg algorithm information array
* @return true if the array was stored
*/
protected boolean setAlgorithm(AlgorithmName alg[])
{
  if (alg == null || alg.length == 0)
    return false;

  m_algorithm_ = alg;
  return true;
}
/**
* Gets how many algorithmic name ranges are stored
* @return number of algorithm name groups, 0 if none have been set
*/
protected int countAlgorithm()
{
  return m_algorithm_ == null ? 0 : m_algorithm_.length;
}
/**
* Gets the index of the Algorithm object the argument code point lies in.
* Ranges are probed from the last one to the first. When no algorithm data
* has been set the search simply finds nothing, consistent with
* countAlgorithm() reporting 0 in that state.
* @param ch code point
* @return index of the Algorithm object the argument code point lies in,
*         otherwise -1 if code point is not found in Algorithm objects
*/
protected int getAlgorithmIndex(int ch)
{
  // countAlgorithm() guards against the array not having been set
  for (int index = countAlgorithm() - 1; index >= 0; index --)
    if (m_algorithm_[index].contains(ch))
      return index;
  return -1;
}
/**
* Appends the algorithmic name of a code point to a StringBuffer, using the
* Algorithm object at the argument index.
* No check is made that the code point belongs to that Algorithm object;
* the result is undefined if it does not.
* @param index of Algorithm object in array
* @param ch code point
* @param str StringBuffer to append to
*/
protected void appendAlgorithmName(int index, int ch, StringBuffer str)
{
  AlgorithmName range = m_algorithm_[index];
  range.appendName(ch, str);
}
/**
* Asks the Algorithm object at the argument index for the code point that
* carries the argument name.
* @param index algorithm index
* @param name code point name
* @return code point in algorithm that matches name, -1 otherwise
*/
protected int getAlgorithmChar(int index, String name)
{
  AlgorithmName range = m_algorithm_[index];
  return range.getAlgorithmChar(name);
}
// private methods =================================================
/**
* Gets the most significant bits value stored for a group, which is the
* first element of the group's record
* @param index the indexth group in datatable
* @return most significant bits representation of group
*/
private char getGroupMSB(int index)
{
  int record = index * m_groupsize_;
  return m_groupinfo_[record];
}
/**
* Reads a block of compressed lengths of 32 strings and expands them into
* offsets and lengths for each string. Lengths are stored with a
* variable-width encoding in consecutive nibbles, high nibble of each byte
* first:
* If a nibble<0xc, then it is the length itself (0 = empty string).
* If a nibble>=0xc, then it forms a length value with the following nibble:
* ((first - 12) << 4 | second) + 12.
* The offsets and lengths arrays must be at least 33 (one more) long because
* there is no check here at the end if the last nibble is still used; the
* low nibble of the final length byte may therefore be written to
* lengths[32].
* @param index of group string object in array
* @param offsets array to store the value of the string offsets, relative to
*        the returned index
* @param lengths array to store the value of the string length
* @return next index of the data string immediately after the lengths
*         in terms of byte address
*/
private int getGroupLengths(int index, char offsets[], char lengths[])
{
// 0xffff means no first nibble of a two-nibble length is pending
char length = 0xffff;
byte b = 0,
n = 0;
int shift;
index = index * m_groupsize_; // byte count offsets of group strings
// start of this group's data in m_groupstring_
int stringoffset = UCharacterUtil.toInt(
m_groupinfo_[index + OFFSET_HIGH_OFFSET_],
m_groupinfo_[index + OFFSET_LOW_OFFSET_]);
offsets[0] = 0;
// all 32 lengths must be read to get the offset of the first group string
// i counts decoded lengths and is advanced inside the nibble loop, the for
// header only advances the byte position
for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++)
{
b = m_groupstring_[stringoffset];
// high nibble first (shift 4), then low nibble (shift 0)
shift = 4;
while (shift >= 0)
{
// getting nibble
n = (byte)((b >> shift) & 0x0F);
if (length == 0xffff && n > SINGLE_NIBBLE_MAX_)
// first half of a two-nibble length, keep it until the next nibble
length = (char)((n - 12) << 4);
else
{
if (length != 0xffff)
// second half of a two-nibble length
lengths[i] = (char)((length | n) + 12);
else
lengths[i] = (char)n;
// running sum of lengths gives the start of the following string
if (i < LINES_PER_GROUP_)
offsets[i + 1] = (char)(offsets[i] + lengths[i]);
length = 0xffff;
i ++;
}
shift -= 4;
}
}
return stringoffset;
}
/**
* Expands one tokenized name of the group string data into text.
* Each name byte is either a literal character or a token; a token stands
* for a null terminated word in the token string data and may be made up
* of two bytes. When a Unicode 1.0 name is requested, the modern name in
* front of the ';' separator is skipped first. Expansion stops at the end
* of the name or at a literal ';'.
* Name bytes are treated as unsigned values in the range 0 to 255.
* @param index of the group name string in byte count
* @param length of the group name string
* @param choice of Unicode 1.0 name or the most current name
* @return the expanded name, or null if it is empty
*/
private String getGroupName(int index, int length, int choice)
{
  if (choice != UCharacterNameChoiceEnum.U_UNICODE_CHAR_NAME)
  {
    // move past the modern name to reach the Unicode 1.0 name
    int oldindex = index;
    index += UCharacterUtil.skipByteSubString(m_groupstring_, index, length,
                                              (byte)';');
    length -= (index - oldindex);
  }

  StringBuffer s = new StringBuffer();
  for (int i = 0; i < length;)
  {
    // widen without sign extension: a byte >= 0x80 must not become negative
    // when it is compared, shifted or used as a table index
    int b = m_groupstring_[index + i] & 0x00ff;
    i ++;

    if (b >= m_tokentable_.length)
    {
      if (b == ';')
        break;
      // implicit letter, appended as a character and not as a number
      s.append((char)b);
    }
    else
    {
      char token = m_tokentable_[b];
      if (token == 0xFFFE)
      {
        // this is a lead byte for a double-byte token
        token = m_tokentable_[b << 8 |
                              (m_groupstring_[index + i] & 0x00ff)];
        i ++;
      }
      if (token == 0xFFFF)
      {
        if (b == ';')
          break;
        s.append((char)b); // explicit letter
      }
      else // write token word
        UCharacterUtil.getNullTermByteSubString(s, m_tokenstring_, token);
    }
  }

  if (s.length() == 0)
    return null;
  return s.toString();
}
/**
* Compares the argument name against every name of a group and retrieves
* the position of the matching one.
* Exactly LINES_PER_GROUP_ names are examined; the extra trailing element
* of the lengths array is never treated as a name.
* Name bytes are treated as unsigned values in the range 0 to 255.
* @param index index where the set of names reside in the group block
* @param length list of lengths of the strings
* @param name character name to search for
* @param choice of either 1.0 or the most current unicode name
* @return relative character in the group which matches name, otherwise if
*         not found, -1 will be returned
*/
private int getGroupChar(int index, char length[], String name, int choice)
{
  int namelen = name.length();

  for (int result = 0; result < LINES_PER_GROUP_; result ++)
  {
    int nindex = 0;
    int len = length[result];

    if (choice != UCharacterNameChoiceEnum.U_UNICODE_CHAR_NAME)
    {
      // move past the modern name to reach the Unicode 1.0 name
      int oldindex = index;
      index += UCharacterUtil.skipByteSubString(m_groupstring_, index, len,
                                                (byte)';');
      len -= (index - oldindex);
    }

    // walk the stored name: literal letters are compared directly, tokens
    // are compared word by word. nindex becomes -1 on the first mismatch.
    int count;
    for (count = 0; count < len && nindex != -1 && nindex < namelen;)
    {
      // widen without sign extension: a byte >= 0x80 must not become
      // negative when it is compared, shifted or used as a table index
      int b = m_groupstring_[index + count] & 0x00ff;
      count ++;

      if (b >= m_tokentable_.length)
      {
        // implicit letter
        if (name.charAt(nindex ++) != b)
          nindex = -1;
      }
      else
      {
        char token = m_tokentable_[b];
        if (token == 0xFFFE)
        {
          // this is a lead byte for a double-byte token
          token = m_tokentable_[b << 8 |
                                (m_groupstring_[index + count] & 0x00ff)];
          count ++;
        }
        if (token == 0xFFFF)
        {
          // explicit letter
          if (name.charAt(nindex ++) != b)
            nindex = -1;
        }
        else
          // compare token with name
          nindex = UCharacterUtil.compareNullTermByteSubString(name,
                                             m_tokenstring_, nindex, token);
      }
    }

    // a match needs the whole argument name consumed and the stored name
    // to end here, either at its length or at the ';' separator
    if (namelen == nindex &&
        (count == len || m_groupstring_[index + count] == ';'))
      return result;

    index += len;
  }
  return -1;
}
// protected inner class ===========================================
/**
* Algorithmic name class. Describes one range of code points whose names
* are computed from a prefix and the code point instead of being stored
* individually.
*/
static final class AlgorithmName
{
  // protected variables ===========================================

  /**
  * Constant type value of the different AlgorithmName.
  * TYPE_0_ names are the prefix followed by the code point in hexadecimal,
  * TYPE_1_ names are the prefix followed by factorized elements.
  */
  protected static final int TYPE_0_ = 0;
  protected static final int TYPE_1_ = 1;

  // private variables =============================================

  /**
  * Algorithmic data information
  */
  private int m_rangestart_;
  private int m_rangeend_;
  private byte m_type_;
  // for TYPE_1_ this is the number of factors, see setFactor
  private byte m_variant_;
  private char m_factor_[];
  private String m_prefix_;
  private byte m_factorstring_[];

  // constructor ===================================================

  /**
  * Constructor
  */
  protected AlgorithmName()
  {
  }

  // protected methods =============================================

  /**
  * Sets the information for accessing the algorithmic names
  * @param rangestart starting code point that lies within this name group
  * @param rangeend end code point that lies within this name group
  * @param type algorithm type. There's 2 kinds of algorithmic type. First
  *        which uses code point as part of its name and the other uses
  *        variant postfix strings
  * @param variant algorithmic variant
  * @return true if values are valid and have been stored
  */
  protected boolean setInfo(int rangestart, int rangeend, byte type,
                            byte variant)
  {
    boolean validrange = rangestart >= UCharacter.MIN_VALUE &&
                         rangestart <= rangeend &&
                         rangeend <= UCharacter.MAX_VALUE;
    boolean validtype = type == TYPE_0_ || type == TYPE_1_;
    if (!validrange || !validtype)
      return false;

    m_rangestart_ = rangestart;
    m_rangeend_ = rangeend;
    m_type_ = type;
    m_variant_ = variant;
    return true;
  }

  /**
  * Sets the factor data. The number of factors has to equal the variant
  * set through setInfo.
  * @param factor array of factor
  * @return true if factors are valid, false if factor is null or of the
  *         wrong length
  */
  protected boolean setFactor(char factor[])
  {
    if (factor == null || factor.length != m_variant_)
      return false;

    m_factor_ = factor;
    return true;
  }

  /**
  * Sets the name prefix
  * @param prefix common start of all names in this range
  * @return true if prefix is set, false if it is null or empty
  */
  protected boolean setPrefix(String prefix)
  {
    if (prefix == null || prefix.length() == 0)
      return false;

    m_prefix_ = prefix;
    return true;
  }

  /**
  * Sets the variant factorized name data
  * @param string variant factorized name data
  * @return true if values are set
  */
  protected boolean setFactorString(byte string[])
  {
    // factor and variant string can be empty for things like hanggul code
    // points
    m_factorstring_ = string;
    return true;
  }

  /**
  * Checks if code point lies in the range of this Algorithm object
  * @param ch code point
  * @return true if range start <= ch <= range end
  */
  protected boolean contains(int ch)
  {
    return m_rangestart_ <= ch && ch <= m_rangeend_;
  }

  /**
  * Appends algorithm name of code point into StringBuffer.
  * Note this method does not check for validity of code point in Algorithm,
  * result is undefined if code point does not belong in Algorithm.
  * Hexadecimal digits are written in upper case so that the name consists
  * of A-Z, 0-9, space and '-' only.
  * @param ch code point
  * @param str StringBuffer to append to
  */
  protected void appendName(int ch, StringBuffer str)
  {
    str.append(m_prefix_);
    switch (m_type_)
    {
      case TYPE_0_:
        // prefix followed by hex digits indicating variants
        str.append(Integer.toHexString(ch).toUpperCase());
        break;
      case TYPE_1_:
        // prefix followed by factorized-elements
        int indexes[] = getFactorIndexes(ch - m_rangestart_);
        // joining up the factorized strings
        String s[] = getFactorString(indexes);
        if (s != null && s.length > 0)
        {
          int size = s.length;
          for (int i = 0; i < size; i ++)
            str.append(s[i]);
        }
        break;
    }
  }

  /**
  * Gets the character for the argument algorithmic name
  * @param name algorithmic name to look up
  * @return the algorithmic char or -1 otherwise.
  */
  protected int getAlgorithmChar(String name)
  {
    if (!name.startsWith(m_prefix_))
      return -1;

    // the part after the prefix is what distinguishes the characters
    String suffix = name.substring(m_prefix_.length());

    switch (m_type_)
    {
      case TYPE_0_ :
        try
        {
          int result = Integer.parseInt(suffix, 16);
          // does it fit into the range?
          if (m_rangestart_ <= result && result <= m_rangeend_)
            return result;
        }
        catch (NumberFormatException e)
        {
          // suffix is not a hexadecimal number, so the name is not one of
          // this range; fall through to the not found result
        }
        break;
      case TYPE_1_ :
        // repetitive suffix name comparison done here
        // offset is the character code - start
        for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++)
        {
          int indexes[] = getFactorIndexes(ch - m_rangestart_);
          if (compareFactorString(indexes, suffix))
            return ch;
        }
    }
    return -1;
  }

  // private methods ================================================

  /**
  * Splits an offset within the range into one index per factor block,
  * using modulo arithmetic from the last factor to the first.
  * @param offset code point - range start
  * @return array of m_variant_ indexes, one for each factor block
  */
  private int[] getFactorIndexes(int offset)
  {
    int indexes[] = new int[m_variant_];
    for (int i = m_variant_ - 1; i > 0; i --)
    {
      // NOTE(review): only the low 8 bits of the factor are used here while
      // getFactorString and compareFactorString use the full char value;
      // confirm factors never exceed 255
      int factor = m_factor_[i] & 0x00FF;
      indexes[i] = offset % factor;
      offset /= factor;
    }
    // we don't need to calculate the last modulus because
    // start <= code <= end guarantees here that code <= factors[0]
    indexes[0] = offset;
    return indexes;
  }

  /**
  * Gets the indexth string in each of the argument factor block
  * @param index array with each index corresponding to each factor block
  * @return array of indexth factor string in factor block, null if index
  *         is null or does not have one entry per factor
  */
  private String[] getFactorString(int index[])
  {
    int size = m_factor_.length;
    if (index == null || index.length != size)
      return null;

    String result[] = new String[size];
    StringBuffer str = new StringBuffer();
    int count = 0;
    int last = size - 1;

    for (int i = 0; i <= last; i ++)
    {
      int factor = m_factor_[i];
      // move to the wanted string of this block and read it
      count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
                                                       count, index[i]);
      count = UCharacterUtil.getNullTermByteSubString(str, m_factorstring_,
                                                      count);
      // skip the remaining strings of this block to reach the next block
      if (i != last)
        count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
                                             count, factor - index[i] - 1);
      result[i] = str.toString();
      str.setLength(0);
    }
    return result;
  }

  /**
  * Compares the indexth string in each of the argument factor block with
  * the argument string
  * @param index array with each index corresponding to each factor block
  * @param str string to compare with
  * @return true if str is exactly the concatenation of the selected factor
  *         strings
  */
  private boolean compareFactorString(int index[], String str)
  {
    int size = m_factor_.length;
    if (index == null || index.length != size)
      return false;

    int count = 0;
    int strcount = 0;
    int last = size - 1;

    for (int i = 0; i <= last; i ++)
    {
      int factor = m_factor_[i];
      // move to the wanted string of this block and compare it
      count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
                                                       count, index[i]);
      strcount = UCharacterUtil.compareNullTermByteSubString(str,
                                         m_factorstring_, strcount, count);
      if (strcount < 0)
        return false;
      // count still addresses the compared string, so it is skipped
      // together with the remaining strings of this block
      if (i != last)
        count = UCharacterUtil.skipNullTermByteSubString(m_factorstring_,
                                                 count, factor - index[i]);
    }
    return strcount == str.length();
  }
}
}

View file

@ -0,0 +1,426 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterPropertyDB.java $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
/**
* Internal class used for the Unicode character property database.
* Database classes store binary data read from uprops.dat and unames.dat for
* use. They do not have the capability to parse the data into higher-level
* information; they only return bytes of information when required.
* Due to the form most commonly used for retrieval, an array of char is used
* to store the binary data.
* UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.
* Responsibility for molding the binary data into a more meaningful form lies
* with <a href=UCharacter.html>UCharacter</a> and
* <a href=UCharacterName.html>UCharacterName</a>.
* Data is populated by <a href=UGenPropReader.html>UGenPropReader</a>.
* @author Syn Wee Quek
* @since oct1000
* @see com.ibm.icu.text.UGenReader
*/
final class UCharacterPropertyDB extends UCharacterDB
{
  // protected variables ================================================

  /**
  * Data type indicators. Each value is the bit position, within the first
  * word of an exception record, of the flag telling whether the record
  * carries that kind of value (see hasExceptionValue()).
  */
  protected static final int EXC_UPPERCASE_ = 0;
  protected static final int EXC_LOWERCASE_ = EXC_UPPERCASE_ + 1;
  protected static final int EXC_TITLECASE_ = EXC_LOWERCASE_ + 1;
  protected static final int EXC_DIGIT_VALUE_ = EXC_TITLECASE_ + 1;
  protected static final int EXC_NUMERIC_VALUE_ = EXC_DIGIT_VALUE_ + 1;
  protected static final int EXC_DENOMINATOR_VALUE_ = EXC_NUMERIC_VALUE_ + 1;
  protected static final int EXC_MIRROR_MAPPING_ = EXC_DENOMINATOR_VALUE_ + 1;

  // private variables ==================================================

  /**
  * Number of bits to shift right to get the correct segment of bits out for
  * index to the unicode database
  */
  private int m_stage1shift_;
  private int m_stage2shift_;
  /**
  * Mask for performing on the bit segment after shifting to get an index out
  * of it
  */
  private int m_stage2maskaftershift_;
  private int m_stage3maskaftershift_;
  /**
  * Table for stages data block
  */
  private char m_stages_[];
  /**
  * Character property table
  */
  private int m_property_[];
  /**
  * Exception property table
  */
  private int m_exception_[];
  /**
  * Default name of the datafile
  */
  private static final String DATA_FILE_NAME_ = "uprops.dat";
  /**
  * Default buffer size of datafile
  */
  private static final int DATA_BUFFER_SIZE_ = 25000;
  /**
  * This, from what i infer is the max size of the indicators used for the
  * exception values.
  * Number of bits in an 8-bit integer value
  */
  private static final int EXC_GROUP_ = 8;
  /**
  * Mask to get the group
  */
  private static final int EXC_GROUP_MASK_ = 255;
  /**
  * Mask to get the digit value in the exception result
  */
  private static final int EXC_DIGIT_MASK_ = 0xFFFF;
  /**
  * Offset table for data in exception block.<br>
  * Entry n holds the number of bits set in the 8-bit value n, e.g.
  * entry 0 = 0 bits, entry 1 = 1 bit, entry 255 = 8 bits.
  */
  private static final byte FLAGS_OFFSET_[] =
  {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
  };
  /**
  * Numeric value shift
  */
  private static final int VALUE_SHIFT_ = 20;
  /**
  * Since character information data are packed together.
  * This is the category mask for getting the category information
  */
  private static final int CATEGORY_MASK_ = 0x1F;
  /**
  * Exception test mask
  */
  private static final int EXCEPTION_MASK_ = 0x20;
  /**
  * Mask to be applied after shifting to obtain an unsigned numeric value
  */
  private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0x7FF;
  /**
  * Mirror test mask
  */
  private static final int MIRROR_MASK_ = 0x800;
  /**
  * Shift to get bidi bits
  */
  private static final int BIDI_SHIFT_ = 6;
  /**
  * Mask to be applied after shifting to get bidi bits
  */
  private static final int BIDI_MASK_AFTER_SHIFT_ = 0x1F;

  // constructor ======================================================

  /**
  * Constructor. Loads the data file found next to this class and hands it
  * to a UGenPropReader, which populates this instance through the set
  * methods. The stream is always closed, whether or not reading succeeds.
  * @exception Exception thrown when the data file is missing, data reading
  *            fails or the data is corrupted
  */
  protected UCharacterPropertyDB() throws Exception
  {
    InputStream i = getClass().getResourceAsStream(DATA_FILE_NAME_);
    if (i == null)
      // fail with a clear message instead of a later "Stream closed" error
      throw new java.io.FileNotFoundException("Data file " + DATA_FILE_NAME_ +
                                              " not found");
    BufferedInputStream b = new BufferedInputStream(i, DATA_BUFFER_SIZE_);
    DataInputStream d = new DataInputStream(b);
    try
    {
      UGenPropReader reader = new UGenPropReader();
      if (!reader.read(d, this))
        throw new Exception("Data corrupted in " + DATA_FILE_NAME_);
    }
    finally
    {
      // release the stream on the failure paths too
      d.close();
    }
  }

  // public methods ===================================================

  /**
  * toString method for printing
  * @return description of the shifts, masks and table sizes held
  */
  public String toString()
  {
    StringBuffer result = new StringBuffer("Property block\n");
    result.append(super.toString());
    result.append("\nshift 1 : ");
    result.append(m_stage1shift_);
    result.append("\nshift 2 : ");
    result.append(m_stage2shift_);
    result.append("\nmask 2 : ");
    result.append(m_stage2maskaftershift_);
    result.append("\nmask 3 : ");
    result.append(m_stage3maskaftershift_);
    result.append("\nsize of stage data ");
    result.append(m_stages_.length);
    result.append("\nsize of property data ");
    result.append(m_property_.length);
    result.append("\nsize of exception data ");
    result.append(m_exception_.length);
    return result.toString();
  }

  // protected methods ================================================

  /**
  * Set stage shift bits and masks
  * @param stage1shift number of bits to shift for the stage 1 index
  * @param stage2shift number of bits to shift for the stage 2 index
  * @param stage2mask mask applied after the stage 2 shift
  * @param stage3mask mask applied to get the stage 3 index
  * @return false if there is a data error, i.e. a negative shift or a zero
  *         mask; nothing is stored in that case
  */
  protected boolean setInfo(int stage1shift, int stage2shift, int stage2mask,
                            int stage3mask)
  {
    if (stage1shift >= 0 && stage2shift >= 0 && stage2mask != 0 &&
        stage3mask != 0)
    {
      m_stage1shift_ = stage1shift;
      m_stage2shift_ = stage2shift;
      m_stage2maskaftershift_ = stage2mask;
      m_stage3maskaftershift_ = stage3mask;
      return true;
    }
    return false;
  }

  /**
  * Set the stages block data.
  * @param stages array containing the stages of index pointing to property
  *        data
  * @return true if stages data is set successfully, false if the array is
  *         null or empty
  */
  protected boolean setStage(char stages[])
  {
    if (stages == null || stages.length <= 0)
      return false;
    m_stages_ = stages;
    return true;
  }

  /**
  * Set the property block data.
  * @param property array containing data regarding the character properties
  * @return true if property data is set successfully, false if the array is
  *         null or empty
  */
  protected boolean setProperty(int property[])
  {
    if (property == null || property.length <= 0)
      return false;
    m_property_ = property;
    return true;
  }

  /**
  * Set the exception block data.
  * @param exception array containing extra character properties not found in
  *        property array
  * @return true if exception data is set successfully, false if the array is
  *         null or empty
  */
  protected boolean setException(int exception[])
  {
    if (exception == null || exception.length <= 0)
      return false;
    m_exception_ = exception;
    return true;
  }

  /**
  * Gets the property value of a code point by walking the stages table
  * three times and using the final entry as index into the property table
  * @param ch code point whose property value is to be retrieved
  * @return property value of code point
  */
  protected int getProperty(int ch)
  {
    // index of the first access to the database
    int index1 = ch >> m_stage1shift_;
    // index of the second access to the database
    int index2 = m_stages_[index1] +
                 ((ch >> m_stage2shift_) & m_stage2maskaftershift_);
    // index of the third access to the database
    int index3 = m_stages_[index2] + (ch & m_stage3maskaftershift_);
    int propindex = m_stages_[index3];
    return m_property_[propindex];
  }

  /**
  * Determines if the exception value passed in has the kind of information
  * which the indicator wants, e.g if the exception value contains the digit
  * value of the character
  * @param index exception index
  * @param indicator type indicator, one of the EXC_ constants
  * @return true if type value exist
  */
  protected boolean hasExceptionValue(int index, int indicator)
  {
    return (m_exception_[index] & (1 << indicator)) != 0;
  }

  /**
  * Gets the exception value at the index, assuming that data type is
  * available. Result is undefined if data is not available. Use
  * hasExceptionValue() to determine data's availability.
  * @param index exception index of the record
  * @param etype exception data type, one of the EXC_ constants
  * @return exception data type value at index; for EXC_DIGIT_VALUE_ only
  *         the low 16 bits are returned
  */
  protected int getException(int index, int etype)
  {
    // first word of the record holds the flags
    int evalue = m_exception_[index];
    index ++;
    // step over the values that precede the wanted one
    index = addExceptionOffset(evalue, etype, index);
    if (etype == EXC_DIGIT_VALUE_)
      return m_exception_[index] & EXC_DIGIT_MASK_;
    return m_exception_[index];
  }

  /**
  * Returns a value indicating a character category from the argument property
  * value
  * @param prop character property value
  * @return category, the low 5 bits of the property value
  */
  protected static int getPropType(int prop)
  {
    return prop & CATEGORY_MASK_;
  }

  /**
  * Determines if the argument props indicates that the exception block has
  * to be accessed for data
  * @param props property value
  * @return true if this is an exception indicator false otherwise
  */
  protected static boolean isExceptionIndicator(int props)
  {
    return (props & EXCEPTION_MASK_) != 0;
  }

  /**
  * Getting the exception index for argument property
  * @param prop character property
  * @return exception index embedded in the property value
  */
  protected static int getExceptionIndex(int prop)
  {
    // NOTE(review): 12 bits remain after shifting by VALUE_SHIFT_ but the
    // mask keeps only 11 of them - confirm against the data format
    return getSignedValue(prop) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
  }

  /**
  * Getting the signed numeric value of a character embedded in the property
  * argument
  * @param prop the character property value
  * @return signed numeric value, sign extended by the arithmetic shift
  */
  protected static int getSignedValue(int prop)
  {
    return (prop >> VALUE_SHIFT_);
  }

  /**
  * Checking if property indicates mirror element
  * @param prop property value
  * @return true if mirror indicator is set, false otherwise
  */
  protected static boolean isMirrored(int prop)
  {
    return (prop & MIRROR_MASK_) != 0;
  }

  /**
  * Getting the direction data in the property value
  * @param prop property value
  * @return direction value in property
  */
  protected static int getDirection(int prop)
  {
    return (prop >> BIDI_SHIFT_) & BIDI_MASK_AFTER_SHIFT_;
  }

  // private methods ===============================================

  /**
  * Getting the correct address for data in the exception value.
  * The address is advanced by the number of flag bits set below the
  * indicator bit, i.e. by the number of values stored before the wanted one.
  * @param evalue exception value, the flag word of the record
  * @param indicator type of data to retrieve
  * @param address current address to move from
  * @return the correct address
  */
  private int addExceptionOffset(int evalue, int indicator, int address)
  {
    if (indicator >= EXC_GROUP_)
    {
      // NOTE(review): no EXC_ constant declared in this class reaches
      // EXC_GROUP_, so this branch looks unused here; the original carried
      // commented-out code shifting evalue and indicator by EXC_GROUP_ -
      // confirm the intended handling before relying on it
      return address + (FLAGS_OFFSET_[evalue & EXC_GROUP_MASK_] << 1);
    }
    int mask = (1 << indicator) - 1;
    return address + FLAGS_OFFSET_[evalue & mask];
  }
}

View file

@ -0,0 +1,265 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UCharacterUtil.java,v $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
/**
* Internal character utility class for simple data type conversion and String
* parsing functions. Does not have an analog in the JDK.
* @author Syn Wee Quek
* @since sep2900
*/
final class UCharacterUtil
{
  // constructor =====================================================

  /**
  * Not meant to be instantiated, all members are static
  */
  private UCharacterUtil()
  {
  }

  // protected methods ===============================================

  /**
  * Packs 2 chars into an int
  * @param msc char placed in the upper 16 bits
  * @param lsc char placed in the lower 16 bits
  * @return the combined int
  */
  protected static int toInt(char msc, char lsc)
  {
    int upper = msc << 16;
    return upper | lsc;
  }

  /**
  * Builds a char from the first 2 bytes of an array, most significant byte
  * first. A null or empty array gives 0, an array of 1 byte gives that byte
  * as an unsigned value.
  * @param bytes array to take the bytes from
  * @return char form
  */
  protected static char toChar(byte bytes[])
  {
    int available = (bytes == null) ? 0 : bytes.length;
    switch (available)
    {
      case 0 :
        return 0;
      case 1 :
        return toChar(bytes[0]);
      default :
        return toChar(bytes[0], bytes[1]);
    }
  }

  /**
  * Builds a char from 2 bytes, each taken as an unsigned value
  * @param msb byte placed in the upper 8 bits
  * @param lsb byte placed in the lower 8 bits
  * @return char form
  */
  protected static char toChar(byte msb, byte lsb)
  {
    int upper = (msb & 0xFF) << 8;
    int lower = lsb & 0xFF;
    return (char)(upper | lower);
  }

  /**
  * Builds an int from at most the first 4 bytes of an array, most
  * significant byte first. A null or empty array gives 0, a shorter array
  * gives the value of the bytes that are there.
  * @param bytes array to take the bytes from
  * @return int form
  */
  protected static int toInt(byte bytes[])
  {
    if (bytes == null)
      return 0;
    int used = Math.min(bytes.length, 4);
    int value = 0;
    int pos = 0;
    while (pos < used)
    {
      value <<= 8;
      value |= bytes[pos] & 0xFF;
      pos ++;
    }
    return value;
  }

  /**
  * Widens a byte to a char without sign extension
  * @param onebyte byte to convert
  * @return char in the range 0 to 255
  */
  protected static char toChar(byte onebyte)
  {
    return (char)(onebyte & 0xFF);
  }

  /**
  * Splits an int into its 4 bytes, each stored as a char, most significant
  * byte first
  * @param i integer to be split
  * @return array of 4 characters
  */
  protected static char[] to4Char(int i)
  {
    char pieces[] = new char[4];
    for (int k = 0, shift = 24; k < 4; k ++, shift -= 8)
      pieces[k] = (char)((i >> shift) & 0xFF);
    return pieces;
  }

  /**
  * Appends the bytes found from index up to, but not including, the next
  * zero byte to a string buffer, each byte as one unsigned char. Nothing is
  * appended when the byte at index is itself zero.
  * @param str stringbuffer receiving the characters
  * @param array byte array to read from
  * @param index position in array to start reading
  * @return position just after the terminating zero byte
  */
  protected static int getNullTermByteSubString(StringBuffer str, byte[] array,
                                                int index)
  {
    int pos = index;
    for (byte current = array[pos ++]; current != 0; current = array[pos ++])
      str.append((char)(current & 0x00FF));
    return pos;
  }

  /**
  * Checks that the bytes found from aindex up to the next zero byte match,
  * as unsigned values, the characters of str starting at strindex.
  * @param str string to compare against
  * @param array byte array holding the zero terminated substring
  * @param strindex position in str to start comparing
  * @param aindex position in array to start comparing
  * @return position in str just after the matched characters, or -1 when a
  *         character differs or str ends before the substring does
  */
  protected static int compareNullTermByteSubString(String str, byte[] array,
                                                    int strindex, int aindex)
  {
    final int limit = str.length();
    int matched = strindex;
    int cursor = aindex;
    while (true)
    {
      int unit = array[cursor ++] & 0xFF;
      if (unit == 0)
        return matched;
      // string exhausted while the substring still has bytes left
      if (matched == limit)
        return -1;
      if (str.charAt(matched) != unit)
        return -1;
      matched ++;
    }
  }

  /**
  * Moves past a number of zero terminated substrings in an array of bytes
  * @param array byte array to scan
  * @param index position in array to start scanning
  * @param skipcount number of zero terminated substrings to move past
  * @return position just after the zero byte ending the last skipped
  *         substring, or index when skipcount is not positive
  */
  protected static int skipNullTermByteSubString(byte[] array, int index,
                                                 int skipcount)
  {
    int pos = index;
    int remaining = skipcount;
    while (remaining > 0)
    {
      // every zero byte ends one substring
      if (array[pos ++] == 0)
        remaining --;
    }
    return pos;
  }

  /**
  * Counts the bytes from index up to and including the first byte equal to
  * skipend, looking at no more than length bytes
  * @param array byte array to scan
  * @param index position in array to start scanning
  * @param length the max number of bytes to look at
  * @param skipend value of the byte to stop after
  * @return the number of bytes passed, length if skipend was not found
  */
  protected static int skipByteSubString(byte[] array, int index, int length,
                                         byte skipend)
  {
    int scanned = 0;
    while (scanned < length)
    {
      byte current = array[index + scanned];
      scanned ++;
      if (current == skipend)
        break;
    }
    return scanned;
  }

  /**
  * Counts the bytes from index up to and including the first byte equal to
  * skipend
  * @param array byte array to scan
  * @param index position in array to start scanning
  * @param skipend value of the byte to stop after
  * @return the number of bytes passed
  */
  protected static int skipByteSubString(byte[] array, int index, byte skipend)
  {
    int pos = index;
    while (array[pos] != skipend)
      pos ++;
    return pos - index + 1;
  }
}

View file

@ -0,0 +1,275 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UGenNameReader.java,v $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.DataInputStream;
/**
* Internal reader class reading binary data from unames.dat created by ICU
* programs gennames.
* It arranges the header and index data apart into meaningful data before
* populating <a href=UCharacterNameDB.html>UCharacterNameDB</a>. UGenNameReader
* does not have or require the ability to decipher the rest of the data in
* unames.dat and hence stores it as a block of data in an array of char in
* <a href=UCharacterNameDB.html>UCharacterNameDB</a>. The ability to decipher
* the block of data lies in <a href=UCharacterName.html>UCharacterName</a>.
* For more information about the format of unames.dat refer to
* <a href=oss.software.ibm.com/icu4j/icu4jhtml/com/ibm/icu/text/readme.html>
* ReadMe</a>.<br>
* unames.dat which is in big-endian format is jared together with this package.
* @author Syn Wee Quek
* @since oct1000
*/
final class UGenNameReader extends UGenReader
{
  // private variables ===========================================

  /**
  * Number of chars making up one group information record
  */
  private static final int GROUP_INFO_SIZE_ = 3;
  /**
  * Offsets read from the index at the start of the data, in reading order
  */
  private int m_tokenstringindex_;
  private int m_groupindex_;
  private int m_groupstringindex_;
  private int m_algnamesindex_;
  /**
  * Size of an algorithmic name information group
  * start code point size + end code point size + type size + variant size +
  * size of data size
  */
  private static final int ALG_INFO_SIZE_ = 12;
  /**
  * File format version and id that this class understands.
  * No guarantees are made if a older version is used
  */
  private static final byte DATA_FORMAT_VERSION_[] =
                                  {(byte)0x1, (byte)0x0, (byte)0x0, (byte)0x0};
  private static final byte DATA_FORMAT_ID_[] = {(byte)0x75, (byte)0x6E,
                                                 (byte)0x61, (byte)0x6D};

  // constructor ==================================================

  /**
  * Constructor
  */
  protected UGenNameReader()
  {
  }

  // protected methods ============================================

  /**
  * Reads the header, then the index, token, group and algorithmic name
  * sections in that order, filling up UCharacterNameDB as it goes. Reading
  * stops at the first section that reports failure.
  * @param input data input stream
  * @param data instance of datablock
  * @return true if every section was read and accepted
  * @exception Exception thrown if there is a failure reading file
  */
  protected boolean read(DataInputStream input, UCharacterNameDB data)
                                                              throws Exception
  {
    return super.read(input, data)
           && readIndex(input)
           && readToken(input, data)
           && readGroup(input, data)
           && readAlg(input, data);
  }

  /**
  * Checks the format id and format version found in the file against the
  * ones this class understands, id first
  * @param dataformatid format id from the file
  * @param dataformatversion format version from the file
  * @return true if both id and version match
  */
  protected boolean authenticate(byte dataformatid[],
                                 byte dataformatversion[])
  {
    return startsWith(dataformatid, DATA_FORMAT_ID_)
           && startsWith(dataformatversion, DATA_FORMAT_VERSION_);
  }

  /**
  * Gets the size of the file format id
  * @return size of file format id in bytes
  */
  protected int getFileFormatIDSize()
  {
    return DATA_FORMAT_ID_.length;
  }

  /**
  * Gets the size of the file format version
  * @return size of file format version in bytes
  */
  protected int getFileFormatVersionSize()
  {
    return DATA_FORMAT_VERSION_.length;
  }

  // private methods =========================================

  /**
  * Compares the leading bytes of an array with an expected sequence
  * @param actual bytes to check
  * @param expected bytes that actual has to begin with
  * @return true if every expected byte is found at the same position
  */
  private static boolean startsWith(byte actual[], byte expected[])
  {
    int pos = 0;
    while (pos < expected.length)
    {
      if (expected[pos] != actual[pos])
        return false;
      pos ++;
    }
    return true;
  }

  /**
  * Read the 4 index values
  * @param input data stream
  * @return true always, failures surface as exceptions
  * @exception Exception thrown when data reading fails
  */
  private boolean readIndex(DataInputStream input) throws Exception
  {
    m_tokenstringindex_ = input.readInt();
    m_groupindex_ = input.readInt();
    m_groupstringindex_ = input.readInt();
    m_algnamesindex_ = input.readInt();
    return true;
  }

  /**
  * Read the token table followed by the token string bytes
  * @param input data stream
  * @param data instance of UCharacterNameDB to populate
  * @return result of handing the tokens over to data
  * @exception Exception thrown when data reading fails
  */
  private boolean readToken(DataInputStream input, UCharacterNameDB data)
                                                              throws Exception
  {
    int tokencount = input.readChar();
    char token[] = new char[tokencount];
    int filled = 0;
    while (filled < tokencount)
    {
      token[filled] = input.readChar();
      filled ++;
    }
    // the token strings run up to the start of the group section
    byte tokenstr[] = new byte[m_groupindex_ - m_tokenstringindex_];
    input.readFully(tokenstr);
    return data.setToken(token, tokenstr);
  }

  /**
  * Read the group information records followed by the group string bytes
  * @param input data stream
  * @param data instance of UCharacterNameDB to populate
  * @return result of handing the groups over to data
  * @exception Exception thrown when data reading fails
  */
  private boolean readGroup(DataInputStream input, UCharacterNameDB data)
                                                              throws Exception
  {
    int groupcount = input.readChar();
    data.setGroupCountSize(groupcount, GROUP_INFO_SIZE_);
    int charcount = groupcount * GROUP_INFO_SIZE_;
    char group[] = new char[charcount];
    int filled = 0;
    while (filled < charcount)
    {
      group[filled] = input.readChar();
      filled ++;
    }
    // the group strings run up to the start of the algorithmic names
    byte groupstring[] = new byte[m_algnamesindex_ - m_groupstringindex_];
    input.readFully(groupstring);
    return data.setGroup(group, groupstring);
  }

  /**
  * Read all the algorithmic name records
  * @param input data stream
  * @param data instance of UCharacterNameDB to populate
  * @return false as soon as one record is rejected, true otherwise
  * @exception Exception thrown when data reading fails
  */
  private boolean readAlg(DataInputStream input, UCharacterNameDB data)
                                                              throws Exception
  {
    int total = input.readInt();
    UCharacterNameDB.AlgorithmName alg[] =
                                     new UCharacterNameDB.AlgorithmName[total];
    int slot = 0;
    while (slot < total)
    {
      alg[slot] = readAlg(input);
      if (alg[slot] == null)
        return false;
      slot ++;
    }
    data.setAlgorithm(alg);
    return true;
  }

  /**
  * Reads an individual record of AlgorithmName
  * @param input stream
  * @return an instance of AlgorithmName if the record is accepted otherwise
  *         null
  * @exception Exception thrown when file read error occurs
  */
  private UCharacterNameDB.AlgorithmName readAlg(DataInputStream input)
                                                              throws Exception
  {
    UCharacterNameDB.AlgorithmName record =
                                         new UCharacterNameDB.AlgorithmName();
    int rangestart = input.readInt();
    int rangeend = input.readInt();
    byte type = input.readByte();
    byte variant = input.readByte();
    if (!record.setInfo(rangestart, rangeend, type, variant))
      return null;

    // size of the record as stored in the data
    int remaining = input.readChar();
    if (type == UCharacterNameDB.AlgorithmName.TYPE_1_)
    {
      // variant gives the number of factors, 2 bytes each
      char factor[] = new char[variant];
      int filled = 0;
      while (filled < variant)
      {
        factor[filled] = input.readChar();
        filled ++;
      }
      record.setFactor(factor);
      remaining -= (variant << 1);
    }

    // zero terminated prefix, one unsigned byte per character
    StringBuffer prefix = new StringBuffer();
    for (char c = (char)(input.readByte() & 0x00FF); c != 0;
         c = (char)(input.readByte() & 0x00FF))
      prefix.append(c);
    record.setPrefix(prefix.toString());

    // whatever is left after the fixed fields and the prefix with its
    // terminator is the factor string
    remaining -= (ALG_INFO_SIZE_ + prefix.length() + 1);
    if (remaining > 0)
    {
      byte string[] = new byte[remaining];
      input.readFully(string);
      record.setFactorString(string);
    }
    return record;
  }
}

View file

@ -0,0 +1,263 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UGenPropReader.java,v $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.DataInputStream;
/**
* Internal reader class reading binary data from uprops.dat created by ICU
* programs genprops.
* It arranges the header and index data apart into meaningful data before
* populating <a href=UCharacterPropertyDB.html>UCharacterPropertyDB</a>.
* UGenPropReader does not have or require the ability to decipher the rest of
* the data in uprops.dat and hence stores it as blocks of data in arrays in
* <a href=UCharacterPropertyDB.html>UCharacterPropertyDB</a>. The ability to
* decipher the blocks of data lies in <a href=UCharacter.html>UCharacter</a>.
* For more information about the format of uprops.dat refer to
* <a href=oss.software.ibm.com/icu4j/icu4jhtml/com/ibm/icu/text/readme.html>
* ReadMe</a>.<br>
* uprops.dat which is in big-endian format is jared together with this package.
* @author Syn Wee Quek
* @since oct0200
*/
final class UGenPropReader extends UGenReader
{
  // private variables ===========================================

  /**
  * Index size, in number of chars
  */
  private static final int INDEX_SIZE_ = 8;
  /**
  * Elements in the index where addresses are in number of chars.
  * Size is basically the count and does not depend on the type.
  */
  private char m_stage2indexsize_;
  private char m_stage3indexsize_;
  private int m_exception_;
  private char m_stage3_;
  private int m_prop_;
  private char m_end_;
  /**
  * Size of actual number of bits used in surrogate unicode character
  */
  private static final int USED_SURROGATE_BIT_SIZE_ = 21;
  /**
  * File format version that this class understands.
  * No guarantees are made if a older version is used
  */
  private static final byte DATA_FORMAT_ID_[] = {(byte)0x55, (byte)0x50,
                                                 (byte)0x72, (byte)0x6F};
  private static final byte DATA_FORMAT_VERSION_[] =
                                  {(byte)0x1, (byte)0x1, (byte)0x0, (byte)0x0};

  // constructor =============================================

  /**
  * Constructor
  */
  protected UGenPropReader()
  {
  }

  // protected methods ==================================================

  /**
  * Reads the header, then the index, stages, property and exception blocks
  * in that order, filling up UCharacterPropertyDB as it goes.
  * If unsuccessful false will be returned
  * @param input data stream
  * @param data data instance
  * @return true if successfully filled
  * @exception Exception thrown when data reading fails
  */
  protected boolean read(DataInputStream input, UCharacterPropertyDB data)
                                                              throws Exception
  {
    if (super.read(input, data))
    {
      // read the indexes
      if (readIndex(input, data) &&
          // read the stages block
          readStage(input, data) &&
          // read the property data
          readProperty(input, data) &&
          // read the exception data
          readException(input, data))
      {
        return true;
      }
    }
    return false;
  }

  /**
  * Checking the file for the correct format
  * @param dataformatid format id from the file
  * @param dataformatversion format version from the file
  * @return true if the file format id and version are the ones understood
  */
  protected boolean authenticate(byte dataformatid[],
                                 byte dataformatversion[])
  {
    int size = DATA_FORMAT_ID_.length;
    for (int i = 0; i < size; i ++)
      if (DATA_FORMAT_ID_[i] != dataformatid[i])
        return false;
    size = DATA_FORMAT_VERSION_.length;
    for (int i = 0; i < size; i ++)
      if (DATA_FORMAT_VERSION_[i] != dataformatversion[i])
        return false;
    return true;
  }

  /**
  * Gets the size of the file format version
  * @return size of file format version in bytes
  */
  protected int getFileFormatVersionSize()
  {
    return DATA_FORMAT_VERSION_.length;
  }

  /**
  * Gets the size of the file format id
  * @return size of file format id in bytes
  */
  protected int getFileFormatIDSize()
  {
    return DATA_FORMAT_ID_.length;
  }

  // private methods ===================================================

  /**
  * Read the INDEX_SIZE_ indexes and updates the instance of
  * UCharacterPropertyDB with the processed shifts and mask
  * @param input data stream
  * @param data instance of UCharacterPropertyDB
  * @return true if successfully read and the shifts and masks were accepted
  * @exception Exception thrown when data reading fails
  */
  private boolean readIndex(DataInputStream input, UCharacterPropertyDB data)
                                                              throws Exception
  {
    int count = INDEX_SIZE_;
    m_stage2indexsize_ = input.readChar();
    count --;
    m_stage3indexsize_ = input.readChar();
    count --;
    m_exception_ = input.readChar();
    count --;
    m_stage3_ = input.readChar();
    count --;
    m_prop_ = input.readChar();
    count --;
    m_end_ = input.readChar();
    count --;
    // the rest of the index is not used. it is read and discarded instead of
    // skipped, since skipBytes may skip fewer bytes than asked for without
    // any notice, which would misalign every block read after this
    for (; count > 0; count --)
      input.readChar();
    return data.setInfo(m_stage3indexsize_ + m_stage2indexsize_,
                        (int)m_stage3indexsize_,
                        (1 << m_stage2indexsize_) - 1,
                        (1 << m_stage3indexsize_) - 1);
  }

  /**
  * Read the stages block and updates the instance of UCharacterPropertyDB
  * with the stages data
  * @param input data stream
  * @param data instance of UCharacterPropertyDB
  * @return true if successfully read
  * @exception Exception thrown when data reading fails
  */
  private boolean readStage(DataInputStream input, UCharacterPropertyDB data)
                                                              throws Exception
  {
    // size of the 3 stages
    int stagesize = (m_prop_ << 1) - INDEX_SIZE_;
    char array[] = new char[stagesize];
    int props = m_prop_ - INDEX_SIZE_;
    // first entry belonging to stage 3, relative to the end of the index
    int stage3start = m_stage3_ - INDEX_SIZE_;
    // setting up the property index for stage 1 to 3
    for (int count = 0; count < stagesize; count ++)
    {
      // addresses in the file count from the top of the index, which is
      // not kept in the array
      array[count] = (char)(input.readChar() - INDEX_SIZE_);
      // setting up the property index for stage 3
      // uprops.dat contain data that includes the address from the top of
      // index to property data. since the blocks are split up, so now i have
      // to subtract the excess address from it.
      if (count >= stage3start)
        array[count] -= props;
    }
    // setting up the stages block in the instance of UCharacterPropertyDB
    return data.setStage(array);
  }

  /**
  * Read the property data block and updates the instance of
  * UCharacterPropertyDB with the data
  * @param input data stream
  * @param data instance of UCharacterPropertyDB
  * @return true if successfully read
  * @exception Exception thrown when data reading fails
  */
  private boolean readProperty(DataInputStream input,
                               UCharacterPropertyDB data) throws Exception
  {
    // getting size of the property block
    int size = m_exception_ - m_prop_;
    int ppty[] = new int[size];
    for (int i = 0; i < size; i ++)
      ppty[i] = input.readInt();
    // setting up the property block in the instance of UCharacterPropertyDB
    return data.setProperty(ppty);
  }

  /**
  * Read the exception data block and updates the instance of
  * UCharacterPropertyDB with the data
  * @param input data stream
  * @param data instance of UCharacterPropertyDB
  * @return true if successfully read
  * @exception Exception thrown when data reading fails
  */
  private boolean readException(DataInputStream input,
                                UCharacterPropertyDB data) throws Exception
  {
    // getting size of the exception block
    int size = m_end_ - m_exception_;
    int exception[] = new int[size];
    for (int i = 0; i < size; i ++)
      exception[i] = input.readInt();
    // setting up the exception block in the instance of UCharacterPropertyDB
    return data.setException(exception);
  }
}

View file

@ -0,0 +1,192 @@
/**
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UGenReader.java,v $
* $Date: 2000/12/26 20:00:56 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.DataInputStream;
import java.io.EOFException;
/**
* Internal parent reader class reading binary header data from uprops.dat and
* unames.dat, created by ICU programs genprops and gennames.
* It arranges the header data into meaningful data before
* populating <a href="UCharacterDB.html">UCharacterDB</a>. It also
* authenticates the data files before proceeding.
* For more information about the format of uprops.dat refer to
* <a href="oss.software.ibm.com/icu4j/icu4jhtml/com/ibm/icu/text/readme.html">
* ReadMe</a>.<br>
* uprops.dat and unames.dat, which are in big-endian format, are jarred
* together with this package.
* @author Syn Wee Quek
* @since oct1000
*/
abstract class UGenReader
{
  // private variables ===========================================

  /**
  * Magic numbers to authenticate the data file
  */
  private static final byte MAGIC1 = (byte)0xda;
  private static final byte MAGIC2 = (byte)0x27;

  /**
  * Size in bytes of the fields datasize and reservedcharacter, which follow
  * the magic numbers and are skipped without being interpreted
  */
  private static final int SKIP_BYTES_ = 4;

  /**
  * Number of single-byte format fields read after the skipped fields:
  * bigendian, charset, charsize and one reserved byte
  */
  private static final int FORMAT_FLAG_BYTES_ = 4;

  /**
  * File format authentication values
  */
  private static final byte BIG_ENDIAN_ = 1;
  private static final byte CHAR_SET_ = 0;
  private static final byte CHAR_SIZE_ = 2;
  private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x0,
                                                  (byte)0x0, (byte)0x0};

  // constructor =================================================

  /**
  * Protected constructor
  */
  protected UGenReader()
  {
  }

  // protected methods ===========================================

  /**
  * Reads the data header, authenticates it and fills the relevant
  * information into UCharacterDB.
  * The whole header, as announced by its leading size field, is consumed
  * before the format checks are made, unless the magic numbers are wrong or
  * the announced size is smaller than the fields that were read.
  * @param input data stream positioned at the start of the header
  * @param data data instance
  * @return true if the header was authenticated and the data instance
  *         accepted the Unicode version; false if the magic numbers, the
  *         format fields or the announced header size are not acceptable
  * @exception Exception thrown when error reading data, including an
  *            EOFException when the stream ends inside the header
  */
  protected boolean read(DataInputStream input, UCharacterDB data)
                                                      throws Exception
  {
    // The header size is an unsigned 16 bit value. It is tracked in an int
    // so that a header announced shorter than the fields read below ends up
    // negative instead of wrapping around to a large positive char value.
    int remaining = input.readChar();
    remaining -= 2;

    // reading the header format
    byte magic1 = input.readByte();
    byte magic2 = input.readByte();
    remaining -= 2;

    skipFully(input, SKIP_BYTES_);
    remaining -= SKIP_BYTES_;

    if (!authenticate(magic1, magic2)) {
      return false;
    }

    byte bigendian = input.readByte();
    byte charset = input.readByte();
    byte charsize = input.readByte();
    // reserved byte, consumed to keep the stream position but never used
    input.readByte();
    remaining -= FORMAT_FLAG_BYTES_;

    int formatidsize = getFileFormatIDSize();
    byte dataformatid[] = new byte[formatidsize];
    input.readFully(dataformatid);
    remaining -= formatidsize;

    int formatversionsize = getFileFormatVersionSize();
    byte dataformatversion[] = new byte[formatversionsize];
    input.readFully(dataformatversion);
    remaining -= formatversionsize;

    byte unicodeversion[] = new byte[UNICODE_VERSION_.length];
    input.readFully(unicodeversion);
    remaining -= UNICODE_VERSION_.length;

    if (remaining < 0) {
      // more bytes were read than the header claims to contain, the file
      // cannot be a valid data file
      return false;
    }
    // the rest of the header carries nothing this reader needs
    skipFully(input, remaining);

    if (authenticate(bigendian, charset, charsize, unicodeversion) &&
        authenticate(dataformatid, dataformatversion)) {
      return setUCharacterDB(data, unicodeversion);
    }
    return false;
  }

  /**
  * Abstract method for verifying the file format version
  * @param formatid file format identification
  * @param formatversion file format version of input file to be verified
  * @return true if the right file format version is used
  */
  protected abstract boolean authenticate(byte formatid[],
                                          byte formatversion[]);

  /**
  * Abstract method for getting the size of the file format version
  * @return size of file format version in bytes
  */
  protected abstract int getFileFormatVersionSize();

  /**
  * Abstract method for getting the size of the file format id
  * @return size of file format id in bytes
  */
  protected abstract int getFileFormatIDSize();

  // private methods ====================================================

  /**
  * Skips exactly the requested number of bytes.
  * DataInputStream.skipBytes is allowed to skip fewer bytes than asked for,
  * so it is called repeatedly until the whole count has been consumed.
  * @param input data stream
  * @param count number of bytes to skip, nothing is done if not positive
  * @exception Exception thrown when error reading data, an EOFException if
  *            the stream ends before count bytes have been skipped
  */
  private static void skipFully(DataInputStream input, int count)
                                                      throws Exception
  {
    int left = count;
    while (left > 0) {
      int skipped = input.skipBytes(left);
      if (skipped > 0) {
        left -= skipped;
      }
      else {
        // no progress was made; reading a single byte tells the end of the
        // stream apart from a skip that merely came up short
        if (input.read() < 0) {
          throw new EOFException(
                       "Unexpected end of data while skipping header bytes");
        }
        left --;
      }
    }
  }

  /**
  * Checking the file against the magic numbers for authenticity
  * @param m1 magic number 1
  * @param m2 magic number 2
  * @return true if the magic numbers are correct
  */
  private boolean authenticate(byte m1, byte m2)
  {
    return m1 == MAGIC1 && m2 == MAGIC2;
  }

  /**
  * Checking the file for the correct format
  * @param bigendian endianness field of the header, has to be BIG_ENDIAN_
  * @param charset character set field of the header, has to be CHAR_SET_
  * @param charsize character size field of the header, has to be CHAR_SIZE_
  * @param unicodeversion version field of the header, has to match
  *        UNICODE_VERSION_ byte for byte
  * @return true if the file is in bigendian, charset == 0, charsize == 2
  *         and unicodeversion is exactly 3.0.0.0
  */
  private boolean authenticate(byte bigendian, byte charset, byte charsize,
                               byte unicodeversion[])
  {
    if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_ ||
        charsize != CHAR_SIZE_) {
      return false;
    }
    int size = UNICODE_VERSION_.length;
    for (int i = 0; i < size; i ++) {
      if (UNICODE_VERSION_[i] != unicodeversion[i]) {
        return false;
      }
    }
    return true;
  }

  /**
  * Sets the relevant data into UCharacterDB
  * @param data UCharacterDB instance to populate
  * @param unicodeversion version number of the Unicode data information used
  * @return true if operation is successful, false otherwise
  */
  private boolean setUCharacterDB(UCharacterDB data, byte[] unicodeversion)
  {
    return data.setUnicodeVersion(unicodeversion);
  }
}