ICU-1763 First cut of Normalizer code port from ICU4C

X-SVN-Rev: 8907
This commit is contained in:
Ram Viswanadha 2002-06-20 01:21:18 +00:00
parent 7c37ae1353
commit c445874382
40 changed files with 25418 additions and 17631 deletions

1
.gitattributes vendored
View file

@ -72,7 +72,6 @@ icu4j/src/com/ibm/icu/dev/data/ThaiWordFreq.xls -text
icu4j/src/com/ibm/icu/dev/data/holidays_jp.ucs -text
icu4j/src/com/ibm/icu/dev/data/rbbi/english.dict -text
icu4j/src/com/ibm/icu/dev/data/thai6.ucs -text
icu4j/src/com/ibm/icu/dev/data/unicode/Draft-TestSuite.txt -text
icu4j/src/com/ibm/icu/impl/data/ICULocaleData.jar -text
icu4j/src/com/ibm/icu/impl/data/thai_dict -text
icu4j/src/com/ibm/icu/impl/data/ucadata.dat -text

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/Attic/UCharacterIteratorTest.java,v $
* $Date: 2002/04/03 00:00:00 $
* $Revision: 1.1 $
* $Date: 2002/06/20 01:16:00 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
@ -15,7 +15,7 @@ package com.ibm.icu.dev.test.lang;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UCharacterIterator;
import com.ibm.icu.impl.UnicodeCharacterIterator;
import com.ibm.icu.text.UTF16;
/**
@ -41,10 +41,10 @@ public final class UCharacterIteratorTest extends TestFmwk
*/
public void TestClone()
{
UCharacterIterator iterator = new UCharacterIterator("testing");
UCharacterIterator cloned = (UCharacterIterator)iterator.clone();
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator("testing");
UnicodeCharacterIterator cloned = (UnicodeCharacterIterator)iterator.clone();
char completed = 0;
while (completed != UCharacterIterator.DONE) {
while (completed != UnicodeCharacterIterator.DONE) {
completed = iterator.next();
if (completed != cloned.next()) {
errln("Cloned operation failed");
@ -57,9 +57,9 @@ public final class UCharacterIteratorTest extends TestFmwk
*/
public void TestIteration()
{
UCharacterIterator iterator = new UCharacterIterator(
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator(
ITERATION_STRING_);
UCharacterIterator iterator2 = new UCharacterIterator(
UnicodeCharacterIterator iterator2 = new UnicodeCharacterIterator(
ITERATION_STRING_);
if (iterator.first() != ITERATION_STRING_.charAt(0)) {
errln("Iterator failed retrieving first character");
@ -75,12 +75,12 @@ public final class UCharacterIteratorTest extends TestFmwk
iterator2.setIndex(0);
iterator.setIndex(0);
int ch = 0;
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
int index = iterator2.getIndex();
ch = iterator2.nextCodePoint();
if (index != ITERATION_SUPPLEMENTARY_INDEX) {
if (ch != (int)iterator.next() &&
ch != UCharacterIterator.DONE_CODEPOINT) {
ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
errln("Error mismatch in next() and nextCodePoint()");
}
}
@ -94,12 +94,12 @@ public final class UCharacterIteratorTest extends TestFmwk
}
iterator.setIndex(ITERATION_STRING_.length());
iterator2.setIndex(ITERATION_STRING_.length());
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
int index = iterator2.getIndex();
ch = iterator2.previousCodePoint();
if (index != ITERATION_SUPPLEMENTARY_INDEX) {
if (ch != (int)iterator.previous() &&
ch != UCharacterIterator.DONE_CODEPOINT) {
ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
errln("Error mismatch in previous() and " +
"previousCodePoint()");
}

File diff suppressed because it is too large Load diff

View file

@ -1,9 +1,15 @@
/*
************************************************************************
* Copyright (c) 1997-2000, International Business Machines
* Corporation and others. All Rights Reserved.
************************************************************************
*/
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/ConformanceTest.java,v $
* $Date: 2002/06/20 01:16:24 $
* $Revision: 1.9 $
*
*****************************************************************************************
*/
package com.ibm.icu.dev.test.normalizer;
@ -13,6 +19,7 @@ import com.ibm.icu.dev.test.*;
import com.ibm.icu.lang.*;
import com.ibm.icu.text.*;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.impl.UCharacterProperty;
public class ConformanceTest extends TestFmwk {
@ -21,28 +28,28 @@ public class ConformanceTest extends TestFmwk {
/**
 * Command-line entry point: creates a ConformanceTest and hands the
 * arguments to its run method.
 *
 * @param args command-line arguments, passed through unchanged to run
 * @throws Exception propagated from run
 */
public static void main(String[] args) throws Exception {
    ConformanceTest test = new ConformanceTest();
    test.run(args);
}
public ConformanceTest() {
// Doesn't matter what the string and mode are; we'll change
// them later as needed.
normalizer = new Normalizer("", Normalizer.COMPOSE);
normalizer = new Normalizer("", Normalizer.NFC);
}
/**
* Test the conformance of Normalizer to
* Test the conformance of NewNormalizer to
* http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt.
* This file must be located at the path specified as TEST_SUITE_FILE.
*/
public void TestConformance() {
BufferedReader input = null;
public void TestConformance() throws Exception{
BufferedReader input = null;
String line = null;
String[] fields = new String[5];
StringBuffer buf = new StringBuffer();
int passCount = 0;
int failCount = 0;
InputStream is = null;
try {
input = TestUtil.getDataReader("unicode/Draft-TestSuite.txt");
input = TestUtil.getDataReader("unicode/NormalizationTest.txt");
for (int count = 0;;++count) {
line = input.readLine();
if (line == null) break;
@ -52,7 +59,7 @@ public class ConformanceTest extends TestFmwk {
// 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
// Skip comments
if (line.charAt(0) == '#') continue;
if (line.charAt(0) == '#' || line.charAt(0)=='@') continue;
// Parse out the fields
hexsplit(line, ';', fields, buf);
@ -101,46 +108,139 @@ public class ConformanceTest extends TestFmwk {
* @param line the source line from the test suite file
* @return true if the test passes
*/
private boolean checkConformance(String[] field, String line) {
private boolean checkConformance(String[] field, String line) throws Exception{
boolean pass = true;
StringBuffer buf = new StringBuffer(); // scratch
String out;
for (int i=0; i<5; ++i) {
String out,fcd;
int i=0;
UTF16.StringComparator comp = new UTF16.StringComparator();
for (i=0; i<5; ++i) {
if (i<3) {
out = Normalizer.normalize(field[i], Normalizer.COMPOSE, 0);
out = Normalizer.normalize(field[i], Normalizer.NFC);
pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.COMPOSE, buf, +1);
out = iterativeNorm(field[i], Normalizer.NFC, buf, +1);
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.COMPOSE, buf, -1);
out = iterativeNorm(field[i], Normalizer.NFC, buf, -1);
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = Normalizer.normalize(field[i], Normalizer.DECOMP, 0);
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, +1);
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, -1);
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = Normalizer.normalize(field[i], Normalizer.NFD);
pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.DECOMP, buf, +1);
out = iterativeNorm(field[i], Normalizer.NFD, buf, +1);
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.DECOMP, buf, -1);
out = iterativeNorm(field[i], Normalizer.NFD, buf, -1);
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, +1);
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, -1);
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
}
out = Normalizer.normalize(field[i], Normalizer.COMPOSE_COMPAT, 0);
out = Normalizer.normalize(field[i], Normalizer.NFKC);
pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.COMPOSE_COMPAT, buf, +1);
out = iterativeNorm(field[i], Normalizer.NFKC, buf, +1);
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.COMPOSE_COMPAT, buf, -1);
out = iterativeNorm(field[i], Normalizer.NFKC, buf, -1);
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = Normalizer.normalize(field[i], Normalizer.DECOMP_COMPAT, 0);
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, +1);
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, -1);
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = Normalizer.normalize(field[i], Normalizer.NFKD);
pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.DECOMP_COMPAT, buf, +1);
out = iterativeNorm(field[i], Normalizer.NFKD, buf, +1);
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.DECOMP_COMPAT, buf, -1);
out = iterativeNorm(field[i], Normalizer.NFKD, buf, -1);
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, +1);
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, -1);
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
}
// test quick checks
if(Normalizer.NO == Normalizer.quickCheck(field[1], Normalizer.NFC)) {
errln("Normalizer error: quickCheck(NFC(s), NewNormalizer.NFC) is NewNormalizer.NO");
pass = false;
}
if(Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.NFD)) {
errln("Normalizer error: quickCheck(NFD(s), NewNormalizer.NFD) is NewNormalizer.NO");
pass = false;
}
if(Normalizer.NO == Normalizer.quickCheck(field[3], Normalizer.NFKC)) {
errln("Normalizer error: quickCheck(NFKC(s), NewNormalizer.NFKC) is NewNormalizer.NO");
pass = false;
}
if(Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.NFKD)) {
errln("Normalizer error: quickCheck(NFKD(s), NewNormalizer.NFKD) is NewNormalizer.NO");
pass = false;
}
if(!Normalizer.isNormalized(field[1], Normalizer.NFC)) {
errln("Normalizer error: isNormalized(NFC(s), NewNormalizer.NFC) is false");
pass = false;
}
if(!field[0].equals(field[1]) && Normalizer.isNormalized(field[0], Normalizer.NFC)) {
errln("Normalizer error: isNormalized(s, NewNormalizer.NFC) is TRUE");
pass = false;
}
if(!Normalizer.isNormalized(field[3], Normalizer.NFKC)) {
errln("Normalizer error: isNormalized(NFKC(s), NewNormalizer.NFKC) is false");
pass = false;
}
if(!field[0].equals(field[3]) && Normalizer.isNormalized(field[0], Normalizer.NFKC)) {
errln("Normalizer error: isNormalized(s, NewNormalizer.NFKC) is TRUE");
pass = false;
}
// test FCD quick check and "makeFCD"
fcd=Normalizer.normalize(field[0], Normalizer.FCD);
if(Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD)) {
errln("Normalizer error: quickCheck(FCD(s), NewNormalizer.FCD) is NewNormalizer.NO");
pass = false;
}
if(Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.FCD)) {
errln("Normalizer error: quickCheck(NFD(s), NewNormalizer.FCD) is NewNormalizer.NO");
pass = false;
}
if(Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.FCD)) {
errln("Normalizer error: quickCheck(NFKD(s), NewNormalizer.FCD) is NewNormalizer.NO");
pass = false;
}
out=Normalizer.normalize(fcd, Normalizer.NFD);
if(!out.equals(field[2])) {
errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
pass = false;
}
if (!pass) {
errln("FAIL: " + line);
}
}
return pass;
}
/**
* Do a normalization using the iterative API in the given direction.
@ -148,20 +248,48 @@ public class ConformanceTest extends TestFmwk {
* @param dir either +1 or -1
*/
private String iterativeNorm(String str, Normalizer.Mode mode,
StringBuffer buf, int dir) {
StringBuffer buf, int dir) throws Exception{
normalizer.setText(str);
normalizer.setMode(mode);
buf.setLength(0);
char ch;
int ch;
if (dir > 0) {
for (ch = normalizer.first(); ch != Normalizer.DONE;
ch = normalizer.next()) {
buf.append(ch);
buf.append(UTF16.toString(ch));
}
} else {
for (ch = normalizer.last(); ch != Normalizer.DONE;
ch = normalizer.previous()) {
buf.insert(0, ch);
buf.insert(0, UTF16.toString(ch));
}
}
return buf.toString();
}
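/**
* Do a normalization using the iterative API in the given direction.
* @param str a Java StringCharacterIterator
* @param mode the normalization mode to set on the normalizer before iterating
* @param buf scratch buffer
* @param dir either +1 or -1
* @return the contents of buf after the iteration completes
*/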
private String iterativeNorm(StringCharacterIterator str, Normalizer.Mode mode,
StringBuffer buf, int dir) throws Exception{
normalizer.setText(str);
normalizer.setMode(mode);
buf.setLength(0);
int ch;
if (dir > 0) {
for (ch = normalizer.first(); ch != Normalizer.DONE;
ch = normalizer.next()) {
buf.append(UTF16.toString(ch));
}
} else {
for (ch = normalizer.last(); ch != Normalizer.DONE;
ch = normalizer.previous()) {
buf.insert(0, UTF16.toString(ch));
}
}
return buf.toString();
@ -180,8 +308,8 @@ public class ConformanceTest extends TestFmwk {
if (exp.equals(got)) {
return true;
}
errln(Utility.escape(" " + msg + ") " + op + "(" + s + ")=" + got +
", exp. " + exp));
errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got) +
", exp. " + hex(exp)));
return false;
}
@ -207,21 +335,26 @@ public class ConformanceTest extends TestFmwk {
}
// Our field is from pos..delim-1.
buf.setLength(0);
while (pos < delim) {
if (s.charAt(pos) == ' ') {
++pos;
} else if (pos+4 > delim) {
throw new IllegalArgumentException("Premature eol in " + s);
} else {
int hex = Integer.parseInt(s.substring(pos, pos+4), 16);
if (hex < 0 || hex > 0xFFFF) {
throw new IllegalArgumentException("Out of range hex " +
hex + " in " + s);
String toHex = s.substring(pos,delim);
pos = delim;
int index = 0;
int len = toHex.length();
while(index< len){
if(toHex.charAt(index)==' '){
index++;
}else{
int spacePos = toHex.indexOf(' ', index);
if(spacePos==-1){
appendInt(buf,toHex.substring(index,len),s);
spacePos = len;
}else{
appendInt(buf,toHex.substring(index, spacePos),s);
}
buf.append((char) hex);
pos += 4;
index = spacePos+1;
}
}
if (buf.length() < 1) {
throw new IllegalArgumentException("Empty field " + i + " in " + s);
}
@ -229,17 +362,29 @@ public class ConformanceTest extends TestFmwk {
++pos; // Skip over delim
}
}
/**
 * Parses a hexadecimal code point and appends it to buf as UTF-16.
 * Values up to 0xFFFF are appended as a single char; values from 0x10000
 * through 0x10FFFF are appended as a lead/trail surrogate pair.
 *
 * @param buf      the buffer that receives the UTF-16 code unit(s)
 * @param strToHex the hexadecimal digits of one code point
 * @param s        the complete source line, used only in the error message
 * @throws IllegalArgumentException if the parsed value is negative or
 *         greater than 0x10FFFF; Integer.parseInt's NumberFormatException
 *         (a subclass of IllegalArgumentException) is raised for
 *         non-hexadecimal input
 */
public static void appendInt(StringBuffer buf, String strToHex, String s){
    int hex = Integer.parseInt(strToHex,16);
    // Anything outside 0..0x10FFFF is not a code point; encoding it with the
    // surrogate arithmetic below would silently produce garbage.
    if (hex < 0 || hex > 0x10FFFF) {
        throw new IllegalArgumentException("Out of range hex " +
                                           hex + " in " + s);
    }else if (hex > 0xFFFF){
        // 0xd7c0 == 0xd800 - (0x10000 >> 10): folds the 0x10000 offset
        // into the lead-surrogate base.
        buf.append((char)((hex>>10)+0xd7c0));
        buf.append((char)((hex&0x3ff)|0xdc00));
    }else{
        buf.append((char) hex);
    }
}
// Specific tests for debugging. These are generally failures
// taken from the conformance file, but culled out to make
// debugging easier. These can be eliminated without affecting
// coverage.
public void _hideTestCase6() {
public void _hideTestCase6() throws Exception{
_testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
}
public void _testOneLine(String line) {
public void _testOneLine(String line) throws Exception{
String[] fields = new String[5];
StringBuffer buf = new StringBuffer();
// Parse out the fields

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/Attic/ExhaustiveTest.java,v $
* $Date: 2002/03/01 18:48:01 $
* $Revision: 1.10 $
* $Date: 2002/06/20 01:16:24 $
* $Revision: 1.11 $
*
*****************************************************************************************
*/
@ -15,46 +15,38 @@ package com.ibm.icu.dev.test.normalizer;
import com.ibm.icu.dev.test.*;
import com.ibm.icu.lang.*;
import com.ibm.icu.text.*;
import com.ibm.icu.dev.tool.normalizer.UInfo;
import com.ibm.icu.impl.NormalizerImpl;
public class ExhaustiveTest extends TestFmwk
{
private UInfo info;
public static void main(String[] args) throws Exception
{
UInfo tempInfo = null;
String[] tempArgs = new String[args.length];
String[] tempArgs = new String[args.length];
int count = 0;
// Allow the test to be pointed at a specific version of the Unicode database
for (int i = 0; i < args.length; i++)
{
if (args[i].equals("-data")) {
tempInfo = new UInfo(args[++i], args[++i]);
} else {
tempArgs[count++] = args[i];
}
}
//for (int i = 0; i < args.length; i++)
//{
// if (args[i].equals("-data")) {
// tempInfo = new UInfo(args[++i], args[++i]);
// } else {
// tempArgs[count++] = args[i];
// }
//}
args = new String[count];
System.arraycopy(tempArgs, 0, args, 0, count);
if (tempInfo == null) {
tempInfo = new UInfo();
}
new ExhaustiveTest(tempInfo).run(args);
new ExhaustiveTest().run(args);
}
public ExhaustiveTest() {
this.info = new UInfo();
}
public ExhaustiveTest(UInfo info) {
this.info = info;
}
/**
* Run through all of the characters returned by a composed-char iterator
@ -89,7 +81,7 @@ public class ExhaustiveTest extends TestFmwk
// make sense
String chString = new StringBuffer().append(ch).toString();
String iterDecomp = iter.decomposition();
String normDecomp = Normalizer.decompose(chString, compat, 0);
String normDecomp = Normalizer.decompose(chString, compat);
if (iterDecomp.equals(chString)) {
errln("ERROR: " + hex(ch) + " has identical decomp");
@ -106,7 +98,7 @@ public class ExhaustiveTest extends TestFmwk
{
for (char x = ++start; x < limit; x++) {
String xString = new StringBuffer().append(x).toString();
String decomp = Normalizer.decompose(xString, compat, options);
String decomp = Normalizer.decompose(xString, compat);
if (!decomp.equals(xString)) {
errln("ERROR: " + hex(x) + " has decomposition (" + hex(decomp) + ")"
+ " but was not returned by iterator");
@ -124,26 +116,31 @@ public class ExhaustiveTest extends TestFmwk
char ch = iter.next();
String chStr = new StringBuffer().append(ch).toString();
String decomp = Normalizer.decompose(chStr, compat, options);
String comp = Normalizer.compose(decomp, compat, options);
String decomp = Normalizer.decompose(chStr, compat);
String comp = Normalizer.compose(decomp, compat);
short cClass = info.getCanonicalClass(decomp.charAt(0));
int cClass = UCharacter.getCombiningClass(decomp.charAt(0));
cClass = 0;
if (info.isExcludedComposition(ch)) {
logln("Skipped excluded char " + hex(ch) + " (" + info.getName(ch,true) + ")" );
if (NormalizerImpl.isFullCompositionExclusion(ch)) {
logln("Skipped excluded char " + hex(ch) + " (" + UCharacter.getName(ch) + ")" );
continue;
}
// Avoid disparaged characters
if (info.getDecomposition(ch).length() == 4) continue;
if (getDecomposition(ch,compat).length() == 4) continue;
if (!comp.equals(chStr)) {
errln("ERROR: Round trip invalid: " + hex(chStr) + " --> " + hex(decomp)
+ " --> " + hex(comp));
errln(" char decomp is '" + info.getDecomposition(ch) + "'");
errln(" char decomp is '" + getDecomposition(ch,compat) + "'");
}
}
}
/**
 * Returns the decomposition of a single char as a String, obtained by
 * calling NormalizerImpl.getDecomposition into a scratch array.
 *
 * @param ch     the character to decompose
 * @param compat passed through to NormalizerImpl.getDecomposition
 * @return a String built from the first chars of the scratch array, using
 *         the count returned by NormalizerImpl.getDecomposition
 */
private String getDecomposition(char ch, boolean compat){
    // NOTE(review): assumes the returned count never exceeds the 10-char
    // scratch capacity -- TODO confirm against NormalizerImpl.
    final char[] scratch = new char[10];
    final int count =
        NormalizerImpl.getDecomposition(ch, compat, scratch, 0, scratch.length);
    return new String(scratch, 0, count);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java,v $
* $Date: 2002/03/19 00:18:44 $
* $Revision: 1.7 $
* $Date: 2002/06/20 01:16:24 $
* $Revision: 1.8 $
*
*****************************************************************************************
*/
@ -39,6 +39,7 @@ public class TestCanonicalIterator extends TestFmwk {
{"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
};
public void TestExhaustive() {
int counter = 0;
int mixedCounter = 0;
@ -63,8 +64,8 @@ public class TestCanonicalIterator extends TestFmwk {
if ((++counter % 5000) == 0) logln("Testing " + Utility.hex(i,0));
String s = UTF16.valueOf(i) + "\u0345";
String decomp = Normalizer.decompose(s, false, 0);
String comp = Normalizer.compose(s, false, 0);
String decomp = Normalizer.decompose(s, false);
String comp = Normalizer.compose(s, false);
// skip characters that don't have either decomp.
// need quick test for this!
if (s.equals(decomp) && s.equals(comp)) continue;
@ -170,14 +171,17 @@ public class TestCanonicalIterator extends TestFmwk {
}
public void TestBasic() {
// check build
UnicodeSet ss = CanonicalIterator.getSafeStart();
logln("Safe Start: " + ss.toPattern(true));
ss = CanonicalIterator.getStarts('a');
expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
+ "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
);
// This is not interesting anymore as the data is already built
// beforehand
// check build
// UnicodeSet ss = CanonicalIterator.getSafeStart();
// logln("Safe Start: " + ss.toPattern(true));
// ss = CanonicalIterator.getStarts('a');
// expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
// new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
// + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
// );
// check permute
// NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!

View file

@ -98,14 +98,14 @@ public class RoundTripTest extends TestFmwk {
public void TestGreekUNGEGN() throws IOException, ParseException {
new Test("Latin-Greek/UNGEGN")
.test("[a-zA-Z]", "[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]",
"[\u00B5\u037A\u03D0-\uFFFF]", /* roundtrip exclusions */
"[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
this, new LegalGreek(false));
}
public void Testel() throws IOException, ParseException {
new Test("Latin-el")
.test("[a-zA-Z]", "[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]",
"[\u00B5\u037A\u03D0-\uFFFF]", /* roundtrip exclusions */
"[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
this, new LegalGreek(false));
}
@ -136,7 +136,7 @@ public class RoundTripTest extends TestFmwk {
String nukta = "\u093c\u09bc\u0a3c\u0abc\u0b3c";
String virama = "\u094d\u09cd\u0a4d\u0acd\u0b4d\u0bcd\u0c4d\u0ccd\u0d4d";
String sanskritStressSigns = "\u0951\u0952\u0953\u0954";
String chandrabindu = "\u0901\u0981\u0A81\u0b01";
String chandrabindu = "\u0901\u0981\u0A81\u0b01\u0c01";
public boolean is(String sourceString){
int cp=sourceString.charAt(0);
@ -221,7 +221,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "Tamil-DEVANAGARI",
"[:tamil:]", "[:Devanagari:]",
"[\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]]", /*roundtrip exclusions*/
"[\u0901\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]]", /*roundtrip exclusions*/
},
new String [] { "DEVANAGARI-Tamil",
"[:Devanagari:]", "[:tamil:]",
@ -239,7 +239,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "KANNADA-DEVANAGARI",
"[:KANNADA:]", "[:Devanagari:]",
"[\u0946\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
"[\u0901\u0946\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
},
new String [] { "DEVANAGARI-KANNADA",
"[:Devanagari:]", "[:KANNADA:]",
@ -248,7 +248,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "MALAYALAM-DEVANAGARI",
"[:MALAYALAM:]", "[:Devanagari:]",
"[\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
"[\u0901\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
},
new String [] { "DEVANAGARI-MALAYALAM",
"[:Devanagari:]", "[:MALAYALAM:]",
@ -284,7 +284,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "Tamil-BENGALI",
"[:tamil:]", "[:BENGALI:]",
"[\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1]", /*roundtrip exclusions*/
"[\u0981\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1]", /*roundtrip exclusions*/
},
new String [] { "BENGALI-Tamil",
"[:BENGALI:]", "[:tamil:]",
@ -302,7 +302,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "KANNADA-BENGALI",
"[:KANNADA:]", "[:BENGALI:]",
"[\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
"[\u0981\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
},
new String [] { "BENGALI-KANNADA",
"[:BENGALI:]", "[:KANNADA:]",
@ -311,7 +311,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "MALAYALAM-BENGALI",
"[:MALAYALAM:]", "[:BENGALI:]",
"[\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
"[\u0981\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
},
new String [] { "BENGALI-MALAYALAM",
"[:BENGALI:]", "[:MALAYALAM:]",
@ -382,7 +382,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "TAMIL-GUJARATI",
"[:TAMIL:]", "[:GUJARATI:]",
"[\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0]", /*roundtrip exclusions*/
"[\u0A81\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0]", /*roundtrip exclusions*/
},
new String [] { "GUJARATI-TAMIL",
"[:GUJARATI:]", "[:TAMIL:]",
@ -400,7 +400,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "KANNADA-GUJARATI",
"[:KANNADA:]", "[:GUJARATI:]",
"[\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
"[\u0A81\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
},
new String [] { "GUJARATI-KANNADA",
"[:GUJARATI:]", "[:KANNADA:]",
@ -409,7 +409,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "MALAYALAM-GUJARATI",
"[:MALAYALAM:]", "[:GUJARATI:]",
"[\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
"[\u0A81\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
},
new String [] { "GUJARATI-MALAYALAM",
"[:GUJARATI:]", "[:MALAYALAM:]",
@ -418,7 +418,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "TAMIL-ORIYA",
"[:TAMIL:]", "[:ORIYA:]",
"[\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61]", /*roundtrip exclusions*/
"[\u0B01\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61]", /*roundtrip exclusions*/
},
new String [] { "ORIYA-TAMIL",
"[:ORIYA:]", "[:TAMIL:]",
@ -436,7 +436,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "KANNADA-ORIYA",
"[:KANNADA:]", "[:ORIYA:]",
"[\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
"[\u0B01\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
},
new String [] { "ORIYA-KANNADA",
"[:ORIYA:]", "[:KANNADA:]",
@ -445,7 +445,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "MALAYALAM-ORIYA",
"[:MALAYALAM:]", "[:ORIYA:]",
"[\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
"[\u0B01\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
},
new String [] { "ORIYA-MALAYALAM",
"[:ORIYA:]", "[:MALAYALAM:]",
@ -458,7 +458,7 @@ public class RoundTripTest extends TestFmwk {
},
new String [] { "TAMIL-TELUGU",
"[:TAMIL:]", "[:TELUGU:]",
"[\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/
"[\u0C01\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/
},
new String [] { "KANNADA-TAMIL",
@ -481,7 +481,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "KANNADA-TELUGU",
"[:KANNADA:]", "[:TELUGU:]",
"[\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/
"[\u0C01\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/
},
new String [] { "TELUGU-KANNADA",
"[:TELUGU:]", "[:KANNADA:]",
@ -490,7 +490,7 @@ public class RoundTripTest extends TestFmwk {
new String [] { "MALAYALAM-TELUGU",
"[:MALAYALAM:]", "[:TELUGU:]",
"[\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/
"[\u0C01\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/
},
new String [] { "TELUGU-MALAYALAM",
"[:TELUGU:]", "[:MALAYALAM:]",
@ -566,7 +566,7 @@ public class RoundTripTest extends TestFmwk {
public boolean is(String sourceString) {
try {
int t;
String decomp = Normalizer.normalize(sourceString, Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);
for (int i = 0; i < decomp.length(); ++i) { // don't worry about surrogates
switch (getType(decomp.charAt(i))) {
case 0:
@ -619,11 +619,11 @@ public class RoundTripTest extends TestFmwk {
public boolean is(String sourceString) {
try {
String decomp = Normalizer.normalize(sourceString, Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);
// modern is simpler: don't care about anything but a grave
if (!full) {
if (sourceString.equals("\u039C\u03C0")) return false;
//if (sourceString.equals("\u039C\u03C0")) return false;
for (int i = 0; i < decomp.length(); ++i) {
char c = decomp.charAt(i);
// exclude all the accents
@ -714,8 +714,8 @@ public class RoundTripTest extends TestFmwk {
public static boolean isSame(String a, String b) {
if (a.equals(b)) return true;
if (a.equalsIgnoreCase(b) && isCamel(a)) return true;
a = Normalizer.normalize(a, Normalizer.DECOMP, 0);
b = Normalizer.normalize(b, Normalizer.DECOMP, 0);
a = Normalizer.normalize(a, Normalizer.NFD);
b = Normalizer.normalize(b, Normalizer.NFD);
if (a.equals(b)) return true;
if (a.equalsIgnoreCase(b) && isCamel(a)) return true;
return false;
@ -925,7 +925,7 @@ public class RoundTripTest extends TestFmwk {
String targ = sourceToTarget.transliterate(cs);
if (!toTarget.containsAll(targ)
|| badCharacters.containsSome(targ)) {
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
String targD = Normalizer.normalize(targ, Normalizer.NFD);
if (!toTarget.containsAll(targD)
|| badCharacters.containsSome(targD)) {
logWrongScript("Source-Target", cs, targ);
@ -934,7 +934,7 @@ public class RoundTripTest extends TestFmwk {
}
}
String cs2 = Normalizer.normalize(cs, Normalizer.DECOMP, 0);
String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
String targ2 = sourceToTarget.transliterate(cs2);
if (!targ.equals(targ2)) {
logNotCanonical("Source-Target", cs, targ, cs2, targ2);
@ -978,14 +978,14 @@ public class RoundTripTest extends TestFmwk {
String targ = sourceToTarget.transliterate(cs);
if (!toTarget.containsAll(targ)
|| badCharacters.containsSome(targ)) {
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
String targD = Normalizer.normalize(targ, Normalizer.NFD);
if (!toTarget.containsAll(targD)
|| badCharacters.containsSome(targD)) {
logWrongScript("Source-Target", cs, targ);
continue;
}
}
String cs2 = Normalizer.normalize(cs, Normalizer.DECOMP, 0);
String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
String targ2 = sourceToTarget.transliterate(cs2);
if (!targ.equals(targ2)) {
logNotCanonical("Source-Target", cs, targ, cs2, targ2);
@ -1005,28 +1005,36 @@ public class RoundTripTest extends TestFmwk {
usi.reset(targetRange);
while (usi.next()) {
int c = usi.codepoint;
String cs;
int c;
if(usi.codepoint == usi.IS_STRING){
cs = usi.string;
c = UTF16.charAt(cs,0);
}else{
c = usi.codepoint;
cs =UTF16.valueOf(c);
}
String cs = UTF16.valueOf(c);
String targ = targetToSource.transliterate(cs);
String reverse = sourceToTarget.transliterate(targ);
if (!toSource.containsAll(targ)
|| badCharacters.containsSome(targ)) {
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
String targD = Normalizer.normalize(targ, Normalizer.NFD);
if (!toSource.containsAll(targD)
|| badCharacters.containsSome(targD)) {
logWrongScript("Target-Source", cs, targ);
failTargSource.add(c);
failTargSource.add(cs);
continue;
}
}
if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)) {
if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)
&& !roundtripExclusions.contains(cs)) {
logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);
failRound.add(c);
continue;
}
String targ2 = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
String targ2 = Normalizer.normalize(targ, Normalizer.NFD);
String reverse2 = sourceToTarget.transliterate(targ2);
if (!reverse.equals(reverse2)) {
logNotCanonical("Target-Source", targ, reverse, targ2, reverse2);
@ -1076,7 +1084,7 @@ public class RoundTripTest extends TestFmwk {
if (!toSource.containsAll(targ) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
|| badCharacters.containsSome(targ)) {
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
String targD = Normalizer.normalize(targ, Normalizer.NFD);
if (!toSource.containsAll(targD) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
|| badCharacters.containsSome(targD)) {
logWrongScript("Target-Source", cs, targ);
@ -1084,11 +1092,13 @@ public class RoundTripTest extends TestFmwk {
}
}
if (!isSame(cs, reverse) /*&& !failRound.contains(c) && !failRound.contains(d)*/
&& !roundtripExclusions.contains(c) && !roundtripExclusions.contains(d)) {
&& !roundtripExclusions.contains(c)
&& !roundtripExclusions.contains(d)
&& !roundtripExclusions.contains(cs)) {
logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);
continue;
}
String targ2 = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
String targ2 = Normalizer.normalize(targ, Normalizer.NFD);
String reverse2 = sourceToTarget.transliterate(targ2);
if (!reverse.equals(reverse2)) {
logNotCanonical("Target-Source", targ, reverse, targ2, reverse2);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
* $Date: 2002/06/12 17:37:10 $
* $Revision: 1.106 $
* $Date: 2002/06/20 01:16:48 $
* $Revision: 1.107 $
*
*****************************************************************************************
*/
@ -2313,10 +2313,10 @@ public class TransliteratorTest extends TestFmwk {
// Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
if (testCases[i].length > 2) target = testCases[i][2];
else if (id.equalsIgnoreCase("NFD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.DECOMP,0);
else if (id.equalsIgnoreCase("NFC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.COMPOSE,0);
else if (id.equalsIgnoreCase("NFKD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.DECOMP_COMPAT,0);
else if (id.equalsIgnoreCase("NFKC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.COMPOSE_COMPAT,0);
else if (id.equalsIgnoreCase("NFD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFD);
else if (id.equalsIgnoreCase("NFC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFC);
else if (id.equalsIgnoreCase("NFKD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFKD);
else if (id.equalsIgnoreCase("NFKC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFKC);
else if (id.equalsIgnoreCase("Lower")) target = UCharacter.toLowerCase(Locale.US, source);
else if (id.equalsIgnoreCase("Upper")) target = UCharacter.toUpperCase(Locale.US, source);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/WriteCharts.java,v $
* $Date: 2002/03/13 19:52:34 $
* $Revision: 1.14 $
* $Date: 2002/06/20 01:16:48 $
* $Revision: 1.15 $
*
*****************************************************************************************
*/
@ -198,7 +198,7 @@ public class WriteCharts {
group |= 16;
}
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.DECOMP_COMPAT, 0))
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
+ "\u0000" + ss,
"<td class='s'>" + ss + "<br><tt>" + hex(ss)
+ "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
@ -262,7 +262,7 @@ public class WriteCharts {
group |= 16;
}
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0)) + ts,
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
"<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
+ "</tt></td><td class='r'>"
+ rt + "<br><tt>" + hex(rt) + "</tt></td>");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/ConvertPOSIXLocale.java,v $
* $Date: 2002/02/16 03:05:27 $
* $Revision: 1.2 $
* $Date: 2002/06/20 01:17:11 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -223,11 +223,11 @@ public class ConvertPOSIXLocale {
process(args);
//{{INIT_CONTROLS
//}}
}
}
public void process(String args[]) {
short options = identifyOptions(args);
String enc="";
String enc=null;
if ((args.length < 2) || ((options & OPT_UNKNOWN) != 0)) {
printUsage();
} else {
@ -249,6 +249,9 @@ public class ConvertPOSIXLocale {
}
}
if(enc==null){
enc="Default";
}
if ((fileName == null) || (locale == null) || (options == 0)) {
printUsage();
} else {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/ICU2LocaleWriter.java,v $
* $Date: 2002/02/16 03:05:28 $
* $Revision: 1.2 $
* $Date: 2002/06/20 01:17:12 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -58,17 +58,20 @@ public class ICU2LocaleWriter extends LocaleWriter {
super.write(tag, o);
} else {
CollationItem[] items = (CollationItem[])o;
print("CollationElements");
println(" { ");
for (int i = 0; i < items.length; i++) {
if(items[i]!=null){
printString(items[i].toString());
if (items[i].comment != null) {
tabTo(30);
print("//");
println(items[i].comment);
if(items[0]!=null){
print("Sequence");
println(" { ");
for (int i = 0; i < items.length; i++) {
if(items[i]!=null){
printString(items[i].toString());
if (items[i].comment != null) {
tabTo(30);
print("//");
println(items[i].comment);
}
}
}
println("}");
}
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/ICULocaleWriter.java,v $
* $Date: 2002/02/16 03:05:28 $
* $Revision: 1.2 $
* $Date: 2002/06/20 01:17:12 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -38,17 +38,20 @@ public class ICULocaleWriter extends LocaleWriter {
super.write(tag, o);
} else {
CollationItem[] items = (CollationItem[])o;
print("CollationElements");
println(" { ");
for (int i = 0; i < items.length; i++) {
if(items[i]!=null){
printString(items[i].toString());
if (items[i].comment != null) {
tabTo(30);
print("//");
println(items[i].comment);
}
}
if(items[0]!=null){
print("Sequence");
println(" { ");
for (int i = 0; i < items.length; i++) {
if(items[i]!=null){
printString(items[i].toString());
if (items[i].comment != null) {
tabTo(30);
print("//");
println(items[i].comment);
}
}
}
println("}");
}
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/PosixCharMap.java,v $
* $Date: 2002/02/16 03:05:30 $
* $Revision: 1.2 $
* $Date: 2002/06/20 01:17:12 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -62,6 +62,139 @@ public class PosixCharMap {
encoding =enc;
load(new BufferedReader(new FileReader(file)));
}
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
/*
 * Lookup table for single-letter C-style escapes, consumed by unescapeAt().
 * Entries are stored as flat pairs: the character that follows the
 * backslash, then the character it stands for.  unescapeAt() scans the
 * pairs in order and stops as soon as it meets an escape code greater than
 * the character it is looking for, which is why the ordering matters.
 */
static private final char[] UNESCAPE_MAP = {
/* The four commented-out entries below map a character to itself; they are
 * not needed in the table because unescapeAt() returns any unrecognized
 * escaped character unchanged. */
/*" 0x22, 0x22 */
/*' 0x27, 0x27 */
/*? 0x3F, 0x3F */
/*\ 0x5C, 0x5C */
/*a*/ 0x61, 0x07,
/*b*/ 0x62, 0x08,
/*f*/ 0x66, 0x0c,
/*n*/ 0x6E, 0x0a,
/*r*/ 0x72, 0x0d,
/*t*/ 0x74, 0x09,
/*v*/ 0x76, 0x0b
};
/**
 * Convert an escape to a 32-bit code point value.  We attempt
 * to parallel the icu4c unescapeAt() function.
 *
 * <p>The following forms are recognized after the backslash:
 * <code>u</code> followed by exactly 4 hex digits, <code>U</code> followed
 * by exactly 8 hex digits, <code>x</code> followed by 1 or 2 hex digits,
 * 1 to 3 octal digits, and the single-letter escapes listed in
 * <code>UNESCAPE_MAP</code>.  Any other character is returned unchanged.</p>
 *
 * @param s the string containing the escape sequence
 * @param offset16 an array containing offset to the character
 * <em>after</em> the backslash.  Upon return offset16[0] will
 * be updated to point after the escape sequence.  It is left
 * untouched when -1 is returned.
 * @return character value from 0 to 10FFFF, or -1 on error.
 */
public static int unescapeAt(String s, int[] offset16) {
    int c;
    int result = 0;
    int n = 0;          /* number of digits consumed so far */
    int minDig = 0;     /* stays 0 when the escape is not numeric */
    int maxDig = 0;
    int bitsPerDigit = 4;
    int dig;

    /* Check that offset is in range */
    int offset = offset16[0];
    int length = s.length();
    if (offset < 0 || offset >= length) {
        return -1;
    }

    /* Fetch first UChar after '\\' */
    c = UTF16.charAt(s, offset);
    offset += UTF16.getCharCount(c);

    /* Convert hexadecimal and octal escapes */
    switch (c) {
    case 'u':
        minDig = maxDig = 4;
        break;
    case 'U':
        minDig = maxDig = 8;
        break;
    case 'x':
        minDig = 1;
        maxDig = 2;
        break;
    default:
        dig = UCharacter.digit(c, 8);
        if (dig >= 0) {
            minDig = 1;
            maxDig = 3;
            n = 1; /* Already have first octal digit */
            bitsPerDigit = 3;
            result = dig;
        }
        break;
    }

    if (minDig != 0) {
        while (offset < length && n < maxDig) {
            // TEMPORARY
            // TODO: Restore the char32-based code when UCharacter.digit
            // is working (Bug 66).
            //c = UTF16.charAt(s, offset);
            //dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
            c = s.charAt(offset);
            dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
            if (dig < 0) {
                break;
            }
            result = (result << bitsPerDigit) | dig;
            //offset += UTF16.getCharCount(c);
            ++offset;
            ++n;
        }
        if (n < minDig) {
            return -1;
        }
        /* Eight hex digits can encode a value above 10FFFF, or overflow
         * the int into a negative number; neither is a code point. */
        if (result < 0 || result > 0x10FFFF) {
            return -1;
        }
        offset16[0] = offset;
        return result;
    }

    /* Convert C-style escapes in table */
    for (int i = 0; i < UNESCAPE_MAP.length; i += 2) {
        if (c == UNESCAPE_MAP[i]) {
            offset16[0] = offset;
            return UNESCAPE_MAP[i+1];
        } else if (c < UNESCAPE_MAP[i]) {
            /* Table is sorted by escape code, so no later entry can match */
            break;
        }
    }

    /* If no special forms are recognized, then consider
     * the backslash to generically escape the next character. */
    offset16[0] = offset;
    return c;
}
/**
 * Returns a copy of the given string in which every backslash escape has
 * been replaced by the character it denotes, as decoded by unescapeAt().
 * Characters that are not part of an escape are copied through unchanged.
 * @param s the string to process
 * @return the string with all escapes expanded
 * @exception IllegalArgumentException if unescapeAt() rejects an escape
 * sequence.
 */
public static String unescape(String s) {
    final int len = s.length();
    StringBuffer out = new StringBuffer();
    int[] cursor = new int[1];   // in/out position handed to unescapeAt()
    int idx = 0;
    while (idx < len) {
        char ch = s.charAt(idx);
        ++idx;
        if (ch != '\\') {
            out.append(ch);
            continue;
        }
        // idx now addresses the character right after the backslash
        cursor[0] = idx;
        int codePoint = unescapeAt(s, cursor);
        if (codePoint < 0) {
            throw new IllegalArgumentException("Invalid escape sequence " +
                                               s.substring(idx-1, Math.min(idx+8, len)));
        }
        UTF16.append(out, codePoint);
        // resume scanning after the consumed escape
        idx = cursor[0];
    }
    return out.toString();
}
public void load(Reader inputReader) throws IOException {
PosixCharMap oldMap = SymbolTransition.getCharMap();
SymbolTransition.setCharMap(null);
@ -104,14 +237,21 @@ public class PosixCharMap {
state = p.nextToken();
} while ((state != EOF) && !p.dataEquals("CHARMAP"));
p.accept(EOL);
if (state != EOF) {
if (state != EOF ) {
p = new Lex(states2, input);
state = p.nextToken();
while (state != EOF) {
while (state != EOF ) {
String key = p.getData();
if(p.dataEquals("ENDCHARMAP")){
break;
}
state = p.nextToken();
while (state == EOL) {
String data = p.getData();
if(p.dataEquals("ENDCHARMAP")){
break;
}
String data = unescape(p.getData());
data.trim();
if (data.startsWith("<U") || data.startsWith("#U")) {
String numData = data.substring(2,data.length()-1);
@ -154,8 +294,7 @@ public class PosixCharMap {
state = p.nextToken();
key=p.getData();
}
}
//state = p.nextToken();
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/translit/UnicodeSetClosure.java,v $
* $Date: 2002/02/25 22:43:59 $
* $Revision: 1.6 $
* $Date: 2002/06/20 01:17:39 $
* $Revision: 1.7 $
*
*****************************************************************************************
*/
@ -95,7 +95,7 @@ public class UnicodeSetClosure {
}
static final Normalizer.Mode[] testModes = {
Normalizer.NO_OP, Normalizer.DECOMP, Normalizer.COMPOSE, Normalizer.DECOMP_COMPAT, Normalizer.COMPOSE_COMPAT};
Normalizer.NONE, Normalizer.NFD, Normalizer.NFC, Normalizer.NFKD, Normalizer.NFKC};
static final String[] modeNames = {
"NoNF", "NFD", "NFC", "NFKD", "NFKC"};
@ -197,7 +197,7 @@ public class UnicodeSetClosure {
String source = UTF16.valueOf(cp);
String result = source;
if (lowerFirst) result = UCharacter.toLowerCase(Locale.US, result);
result = Normalizer.normalize(result, mode, 0);
result = Normalizer.normalize(result, mode);
if (lowerFirst) result = UCharacter.toLowerCase(Locale.US, result);
if (result.equals(source)) return null;
return result;

View file

@ -31,13 +31,13 @@ import java.io.*;
public class genIndexFilters {
public static void main(String[] args) throws IOException {
Normalizer.Mode m = Normalizer.NO_OP;
Normalizer.Mode m = Normalizer.NONE;
boolean lowerFirst = false;
if (args.length >= 2) {
if (args[1].equalsIgnoreCase("NFD")) {
m = Normalizer.DECOMP;
m = Normalizer.NFD;
} else if (args[1].equalsIgnoreCase("NFKD")) {
m = Normalizer.DECOMP_COMPAT;
m = Normalizer.NFKD;
} else {
usage();
}
@ -59,7 +59,7 @@ public class genIndexFilters {
Transliterator t = Transliterator.getInstance(ID);
// TransliteratorUtility gives us access to package private API
UnicodeSet sourceSet = TransliteratorUtility.getSourceSet(t);
if (m != Normalizer.NO_OP || lowerFirst) {
if (m != Normalizer.NONE || lowerFirst) {
UnicodeSetClosure.close(sourceSet, m, lowerFirst);
}
System.out.println(sourceSet.toPattern(true));

View file

@ -0,0 +1,157 @@
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/ICUCharacterIterator.java,v $
* $Date: 2002/06/20 01:18:07 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.text.CharacterIterator;
/**
 * Adapter that exposes a <code>java.text.CharacterIterator</code> through
 * the <code>UCharacterIterator</code> interface.
 *
 * <p>All indices used by this class are zero-based offsets from the wrapped
 * iterator's begin index.  The begin index and the text length are read
 * once, in the constructor, and cached.</p>
 */
public class ICUCharacterIterator extends UCharacterIterator {

    /**
     * The wrapped iterator; it is repositioned on every read
     */
    private CharacterIterator iterator;

    /**
     * Current index
     */
    private int currentIndex;

    /**
     * length
     */
    private int length;

    /**
     * cache of begin offset in character iterator
     */
    private int beginIndex;

    /**
     * Wraps the given iterator.  Iteration starts at offset 0, i.e. at the
     * wrapped iterator's begin index.
     * @param iter the iterator to wrap
     * @exception IllegalArgumentException if iter is null
     */
    public ICUCharacterIterator(CharacterIterator iter){
        if(iter==null){
            throw new IllegalArgumentException();
        }
        iterator     = iter;
        currentIndex = 0;
        beginIndex   = iter.getBeginIndex();
        length       = iter.getEndIndex() - beginIndex;
    }

    /**
     * Returns the code unit at the current index without moving, or DONE
     * when the index is at the end of the text.
     * @see UCharacterIterator#current()
     */
    public int current() {
        if (currentIndex < length) {
            return iterator.setIndex(beginIndex + currentIndex);
        }
        return DONE;
    }

    /**
     * @see UCharacterIterator#getLength()
     */
    public int getLength() {
        return length;
    }

    /**
     * @see UCharacterIterator#getIndex()
     */
    public int getIndex() {
        return currentIndex;
    }

    /**
     * Returns the code unit at the current index and then advances the
     * index by one, or returns DONE without moving when at the end.
     * @see UCharacterIterator#next()
     */
    public int next() {
        if(currentIndex < length){
            return iterator.setIndex(beginIndex + currentIndex++);
        }
        return DONE;
    }

    /**
     * Moves the index back by one and returns the code unit found there,
     * or returns DONE without moving when already at the start.
     * @see UCharacterIterator#previous()
     */
    public int previous() {
        if(currentIndex>0){
            return iterator.setIndex(beginIndex + --currentIndex);
        }
        return DONE;
    }

    /**
     * Sets the current index.  The end position (index == length) is valid.
     * @exception IndexOutOfBoundsException if index is negative or greater
     * than the text length
     * @see UCharacterIterator#setIndex(int)
     */
    public void setIndex(int index) {
        if (index < 0 || index > length) {
            throw new IndexOutOfBoundsException();
        }
        currentIndex = index;
    }

    /**
     * @see UCharacterIterator#setToLimit()
     */
    public void setToLimit() {
        currentIndex = length;
    }

    /**
     * Copies the whole text into fillIn, starting at the given offset.
     * The current index of this iterator is not changed.
     * @param fillIn destination array
     * @param offset position in fillIn of the first copied code unit
     * @return the number of code units copied, i.e. the text length
     * @exception IndexOutOfBoundsException if the text does not fit into
     * fillIn at the given offset
     * @see UCharacterIterator#getText(char[])
     */
    public int getText(char[] fillIn, int offset){
        // Written as a subtraction so that a huge offset cannot overflow.
        if(offset < 0 || offset > fillIn.length - length){
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }
        // Copy exactly `length` code units.  The loop must not stop on
        // CharacterIterator.DONE: that sentinel is '\uFFFF', which can
        // also occur as a real character inside the text.
        char ch = iterator.first();
        for (int i = 0; i < length; i++) {
            fillIn[offset + i] = ch;
            ch = iterator.next();
        }
        // put the wrapped iterator back where this iterator points
        iterator.setIndex(beginIndex + currentIndex);
        return length;
    }

    /**
     * Creates a clone of this iterator.  Clones the underlying character iterator.
     * @return the clone, or null if cloning is not supported
     * @see UCharacterIterator#clone()
     */
    public Object clone(){
        try {
            ICUCharacterIterator result = (ICUCharacterIterator) super.clone();
            result.iterator = (CharacterIterator)this.iterator.clone();
            return result;
        } catch (CloneNotSupportedException e) {
            return null; // only invoked if bad underlying character iterator
        }
    }

    /**
     * Moves the current index by the given (possibly negative) amount,
     * clamping the result to the range 0..length.
     * @param index the delta to add to the current index
     * @return the new current index
     * @see UCharacterIterator#moveIndex()
     */
    public int moveIndex(int index){
        currentIndex += index;
        if(currentIndex < 0) {
            currentIndex = 0;
        } else if(currentIndex > length) {
            currentIndex = length;
        }
        return currentIndex;
    }

    /**
     * Returns a clone of the wrapped iterator, so callers cannot disturb
     * the instance held by this object.
     * @see UCharacterIterator#getCharacterIterator()
     */
    public CharacterIterator getCharacterIterator(){
        return (CharacterIterator)iterator.clone();
    }
}

View file

@ -5,14 +5,15 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerDataReader.java,v $
* $Date: 2002/03/28 01:50:59 $
* $Revision: 1.3 $
* $Date: 2002/06/20 01:18:07 $
* $Revision: 1.4 $
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.*;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.util.VersionInfo;
/**
* @version 1.0
* @author Ram Viswanadha
@ -288,8 +289,8 @@ final class NormalizerDataReader {
throws IOException{
if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
DATA_FORMAT_VERSION_, UNICODE_VERSION_);
ICUBinary.readHeader(inputStream, DATA_FORMAT_ID,
DATA_FORMAT_VERSION, UNICODE_VERSION);
if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
@ -299,61 +300,63 @@ final class NormalizerDataReader {
}
// protected methods -------------------------------------------------
protected int[] readIndexes(int length)throws IOException{
int[] indexes = new int[length];
//Read the indexes
for (int i = 0; i <length ; i++) {
indexes[i] = dataInputStream.readInt();
}
return indexes;
}
/**
* <p>Reads uprops.dat, parse it into blocks of data to be stored in
* NormalizerImpl.</p>
* @param impl NormalizerImpl instance
* @param normBytes
* @param fcdBytes
* @param auxBytes
* @param extraData
* @param combiningTable
* @param canonStartSets
* @exception thrown when data reading fails
* @draft 2.1
*/
protected void read(NormalizerImpl impl)
throws IOException{
//Read the indexes
int[] indexes = new int[NormalizerImpl.INDEX_TOP];
for (int i = 0; i <indexes.length ; i++) {
indexes[i] = dataInputStream.readInt();
}
//Read the bytes that make up the normTrie
byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
char[] extraData, char[] combiningTable,
Object[] canonStartSets)
throws IOException{
//Read the bytes that make up the normTrie
dataInputStream.read(normBytes);
ByteArrayInputStream normTrieStream= new ByteArrayInputStream(normBytes);
//normTrieStream= new ByteArrayInputStream(normBytes);
//Read the extra data
int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
char[] extraData = new char[extraDataTop];
for(int i=0;i<extraDataTop;i++){
for(int i=0;i<extraData.length;i++){
extraData[i]=dataInputStream.readChar();
}
//Read the combining class table
int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
char[] combiningTable = new char[combiningTableTop];
for(int i=0; i<combiningTableTop; i++){
for(int i=0; i<combiningTable.length; i++){
combiningTable[i]=dataInputStream.readChar();
}
//Read the fcdTrie
byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
dataInputStream.read(fcdBytes);
ByteArrayInputStream fcdTrieStream= new ByteArrayInputStream(fcdBytes);
//Read the AuxTrie
byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
//Read the AuxTrie
dataInputStream.read(auxBytes);
ByteArrayInputStream auxTrieStream= new ByteArrayInputStream(auxBytes);
//Read the canonical start sets
Object[] canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
for(int i=0; i<canonStartSetsIndexes.length; i++){
for(int i=0; i<canonStartSetsIndexes.length; i++){
canonStartSetsIndexes[i]=dataInputStream.readChar();
}
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]-NormalizerImpl.SET_INDEX_TOP];
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]-NormalizerImpl.SET_INDEX_TOP];
for(int i=0; i<startSets.length; i++){
startSets[i]=dataInputStream.readChar();
}
@ -369,20 +372,11 @@ final class NormalizerDataReader {
canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX ] = bmpTable;
canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
//Now set the tries
impl.normTrieImpl.normTrie = new IntTrie( normTrieStream,impl.normTrieImpl );
impl.fcdTrieImpl.fcdTrie = new CharTrie(fcdTrieStream,impl.fcdTrieImpl );
impl.auxTrieImpl.auxTrie = new CharTrie( auxTrieStream, impl.auxTrieImpl );
impl.indexes = indexes;
impl.extraData = extraData;
impl.combiningTable = combiningTable;
impl.isDataLoaded = true;
impl.canonStartSets = canonStartSets;
impl.isFormatVersion_2_1 = DATA_FORMAT_VERSION_[0]>2 || (DATA_FORMAT_VERSION_[0]==2 && DATA_FORMAT_VERSION_[1]>=1);
}
public byte[] getDataFormatVersion(){
return DATA_FORMAT_VERSION;
}
// private data members -------------------------------------------------
@ -396,13 +390,13 @@ final class NormalizerDataReader {
* No guarantees are made if a older version is used
* see store.c of gennorm for more information and values
*/
private static final byte DATA_FORMAT_ID_[] = {(byte)0x4E, (byte)0x6F,
private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
(byte)0x72, (byte)0x6D};
private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x2, (byte)0x1,
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x1,
(byte)0x5, (byte)0x2};
//TODO: Set the version info after the VersionInfo class is ported
private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x1,
(byte)0x1, (byte)0x0};
private static final String UNICODE_VERSION_STRING_ = "3.1.1.0";
private static final byte UNICODE_VERSION[] = {(byte)0x3, (byte)0x2,
(byte)0x0, (byte)0x0};
private static final String UNICODE_VERSION_STRING = "3.2.0.0";
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,240 @@
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/ReplaceableCharacterIterator.java,v $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.UTF16;
/**
 * Iterator over the UTF-16 code units of a <code>Replaceable</code> text.
 * The text length is read once, in the constructor, and cached.
 *
 * DLF docs must define behavior when Replaceable is mutated underneath
 * the iterator.
 *
 * This and ICUCharacterIterator share some code, maybe they should share
 * an implementation, or the common state and implementation should be
 * moved up into UCharacterIterator.
 *
 * What are first, last, and getBeginIndex doing here?!?!?!
 */
public class ReplaceableCharacterIterator extends UCharacterIterator {

    // private data members ----------------------------------------------------

    /**
     * Text being iterated over
     */
    private Replaceable replaceable;

    /**
     * Position of the next code unit to be read
     */
    private int currentIndex;

    /**
     * Length of the text, cached at construction time
     */
    private int length;

    // public constructors -----------------------------------------------------

    /**
     * Creates an iterator positioned at the start of the given text.
     * @param replaceable text which the iterator will be based on
     * @exception IllegalArgumentException if replaceable is null
     */
    public ReplaceableCharacterIterator(Replaceable replaceable){
        if(replaceable==null){
            throw new IllegalArgumentException();
        }
        this.replaceable  = replaceable;
        this.currentIndex = 0;
        this.length       = this.replaceable.length();
    }

    /**
     * Creates an iterator positioned at the start of the given string.
     * @param str text which the iterator will be based on
     * @exception IllegalArgumentException if str is null
     */
    public ReplaceableCharacterIterator(String str){
        if(str==null){
            throw new IllegalArgumentException();
        }
        this.replaceable  = new ReplaceableString(str);
        this.currentIndex = 0;
        this.length       = this.replaceable.length();
    }

    /**
     * Creates an iterator positioned at the start of the given characters.
     * @param src an array of characters on which the iterator will be based
     * @exception IllegalArgumentException if src is null
     */
    public ReplaceableCharacterIterator(char[] src){
        if(src==null){
            throw new IllegalArgumentException();
        }
        this.replaceable  = new ReplaceableString(new String(src));
        this.currentIndex = 0;
        this.length       = this.replaceable.length();
    }

    /**
     * Creates an iterator positioned at the start of the given buffer.
     * @param buf buffer of text on which the iterator will be based
     * @exception IllegalArgumentException if buf is null
     */
    public ReplaceableCharacterIterator(StringBuffer buf){
        if(buf==null){
            throw new IllegalArgumentException();
        }
        this.replaceable  = new ReplaceableString(buf);
        this.currentIndex = 0;
        this.length       = this.replaceable.length();
    }

    // public methods ----------------------------------------------------------

    /**
     * Creates a copy of this iterator.  This method does not copy the
     * underlying <code>Replaceable</code> object.
     * @return copy of this iterator, or null if cloning is not supported
     */
    public Object clone(){
        Object copy;
        try {
            copy = super.clone();
        } catch (CloneNotSupportedException e) {
            copy = null; // never invoked
        }
        return copy;
    }

    /**
     * Returns the UTF-16 code unit at the current index without moving.
     * @return the current code unit, or DONE if the index is at the end of
     * the text
     */
    public int current(){
        return (currentIndex < length) ? replaceable.charAt(currentIndex)
                                       : DONE;
    }

    /**
     * Returns the code point that starts at the current index.  The current
     * index is the same after the call as before it.
     * @return the supplementary code point if the current index holds a
     * lead surrogate that is followed by a trail surrogate; otherwise the
     * value returned by current()
     */
    public int currentCodePoint(){
        int lead = current();
        if(!UTF16.isLeadSurrogate((char)lead)){
            // covers ordinary characters, unpaired trail surrogates and DONE
            return lead;
        }
        // Peek at the following code unit: step over the lead, read, and
        // step back so that the index is restored.
        next();
        int trail = current();
        previous();
        if(!UTF16.isTrailSurrogate((char)trail)){
            // unpaired lead surrogate
            return lead;
        }
        return UCharacterProperty.getRawSupplementary((char)lead,
                                                      (char)trail);
    }

    /**
     * Returns the start of the text.
     * @return 0
     */
    public int getBeginIndex(){
        return 0;
    }

    /**
     * Returns the length of the text, in UTF-16 code units.
     * @return length of the text
     */
    public int getLength(){
        return length;
    }

    /**
     * Returns the current index in the text.
     * @return current index in the text
     */
    public int getIndex(){
        return currentIndex;
    }

    /**
     * Returns the UTF-16 code unit at the current index and then advances
     * the index by one.  When the index is already at the end of the text,
     * the index is left unchanged and DONE is returned.
     * @return the code unit read, or DONE at the end of the text
     */
    public int next(){
        if (currentIndex >= length) {
            return DONE;
        }
        return replaceable.charAt(currentIndex++);
    }

    /**
     * Moves the index back by one and returns the UTF-16 code unit at the
     * new index.  When the index is already 0, the index is left unchanged
     * and DONE is returned.
     * @return the code unit read, or DONE at the start of the text
     */
    public int previous(){
        if (currentIndex <= 0) {
            return DONE;
        }
        return replaceable.charAt(--currentIndex);
    }

    /**
     * Sets the current index.  The end position (index == length) is valid.
     * This assumes the text is stored as 16-bit code units.
     * @param currentIndex the new index within the text
     * @exception IndexOutOfBoundsException if currentIndex is negative or
     * greater than the text length
     */
    public void setIndex(int currentIndex) throws IndexOutOfBoundsException{
        boolean inRange = currentIndex >= 0 && currentIndex <= length;
        if (!inRange) {
            throw new IndexOutOfBoundsException();
        }
        this.currentIndex = currentIndex;
    }

    /**
     * Copies the whole text into fillIn, starting at the given offset.
     * @param fillIn destination array
     * @param offset position in fillIn of the first copied code unit
     * @return the number of code units copied, i.e. the text length
     * @exception IndexOutOfBoundsException if the text does not fit into
     * fillIn at the given offset
     */
    public int getText(char[] fillIn, int offset){
        boolean fits = offset >= 0 && offset + length <= fillIn.length;
        if(!fits){
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }
        replaceable.getChars(0, length, fillIn, offset);
        return length;
    }

    /**
     * Returns the whole text as a string.
     * @return a new string holding all code units of the text
     */
    public String getString(){
        char[] units = new char[length];
        replaceable.getChars(0, length, units, 0);
        return String.valueOf(units);
    }
}

View file

@ -0,0 +1,91 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/UCharArrayIterator.java,v $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.impl;
/**
 * Iterator over the UTF-16 code units in the range [start, limit) of a
 * char array.  The array is used directly, not copied.  Indices reported
 * and accepted by this class are zero-based offsets from <code>start</code>.
 *
 * @author Doug Felt
 */
public final class UCharArrayIterator extends UCharacterIterator {
    private final char[] text;
    private final int start;
    private final int limit;
    /** Absolute position in text, always within start..limit. */
    private int pos;

    /**
     * Creates an iterator over text[start..limit), positioned at start.
     * @param text the array holding the characters
     * @param start index of the first character of the range
     * @param limit index just past the last character of the range
     * @exception IllegalArgumentException if text is null, or if the range
     * is not contained in the array, or if start is greater than limit
     */
    public UCharArrayIterator(char[] text, int start, int limit) {
        // Reject null the same way the sibling iterators do, instead of
        // failing with a NullPointerException on text.length below.
        if (text == null) {
            throw new IllegalArgumentException("text must not be null");
        }
        if (start < 0 || limit > text.length || start > limit) {
            // text.length itself is an accepted limit, hence the closed range
            throw new IllegalArgumentException("start: " + start + " or limit: "
                                               + limit + " out of range [0, "
                                               + text.length + "]");
        }
        this.text = text;
        this.start = start;
        this.limit = limit;

        this.pos = start;
    }

    /**
     * Returns the code unit at the current index without moving, or DONE
     * when the index is at the end of the range.
     */
    public int current() {
        return pos < limit ? text[pos] : DONE;
    }

    /**
     * Returns the number of code units in the range.
     */
    public int getLength() {
        return limit - start;
    }

    /**
     * Returns the current index, relative to the start of the range.
     */
    public int getIndex() {
        return pos - start;
    }

    /**
     * Returns the code unit at the current index and then advances by one,
     * or returns DONE without moving when at the end of the range.
     */
    public int next() {
        return pos < limit ? text[pos++] : DONE;
    }

    /**
     * Moves back by one and returns the code unit found there, or returns
     * DONE without moving when at the start of the range.
     */
    public int previous() {
        return pos > start ? text[--pos] : DONE;
    }

    /**
     * Sets the current index, relative to the start of the range.  The end
     * position (index == getLength()) is valid.
     * @exception IndexOutOfBoundsException if index is negative or greater
     * than the length of the range
     */
    public void setIndex(int index) {
        if (index < 0 || index > limit - start) {
            // the length itself is an accepted index, hence the closed range
            throw new IndexOutOfBoundsException("index: " + index +
                                                " out of range [0, "
                                                + (limit - start) + "]");
        }
        pos = start + index;
    }

    /**
     * Copies the whole range into fillIn, starting at the given offset.
     * No explicit bounds check is made here; the copy is delegated to
     * System.arraycopy.
     * @param fillIn destination array
     * @param offset position in fillIn of the first copied code unit
     * @return the number of code units copied
     */
    public int getText(char[] fillIn, int offset) {
        int len = limit - start;
        System.arraycopy(text, start, fillIn, offset, len);
        return len;
    }

    /**
     * Returns the characters of the range as a new string.
     */
    public String getString() {
        return new String(text, start, limit - start);
    }

    /**
     * Creates a copy of this iterator.  This method does not copy the
     * underlying char array.
     * @return copy of this iterator, or null if cloning is not supported
     */
    public Object clone(){
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            return null; // never invoked
        }
    }
}

View file

@ -5,335 +5,399 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UCharacterIterator.java,v $
* $Date: 2002/05/14 16:48:49 $
* $Revision: 1.5 $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.StringCharacterIterator;
import com.ibm.icu.text.UTF16;
import java.text.CharacterIterator;
import com.ibm.icu.impl.UCharArrayIterator;
/**
* Internal class that iterates through a com.ibm.text.Replacable text object
* to return either Unicode characters.
* @author synwee
* @version release 2.1, February 2002
* DLF- Docs mostly need 1) much more description of iteration behavior,
* especially at endpoints and with empty or single character strings,
* and 2) need to describe the other major difference with Java
* CharacterIterator, which is that this also returns code points as
* well as code units.
*
* Don't understand why setIndex and moveIndex have different exception behavior.
* I expect they shouldn't.
*/
public final class UCharacterIterator implements CharacterIterator
{
// public data members -----------------------------------------------------
/**
* Indicator that we have reached the ends of the UTF16 text when returning
* 16 bit character.
*/
public static final int DONE = 0xFFFF;
/**
* Indicator that we have reached the ends of the UTF16 text when returning
* codepoints.
*/
public static final int DONE_CODEPOINT = -1;
// public constructor ------------------------------------------------------
/**
* Public constructor.
* By default the iteration range will be from 0 to the end of the text.
* @param replacable text which the iterator will be based on
*/
public UCharacterIterator(Replaceable replaceable)
{
m_replaceable_ = replaceable;
m_index_ = 0;
m_start_ = 0;
m_limit_ = replaceable.length();
}
/**
* Public constructor
* By default the iteration range will be from 0 to the end of the text.
* @param str text which the iterator will be based on
*/
public UCharacterIterator(String str)
{
m_replaceable_ = new ReplaceableString(str);
m_index_ = 0;
m_start_ = 0;
m_limit_ = m_replaceable_.length();
}
/**
* Constructs an iterator over the given range of the given string.
* @param text text to be iterated over
* @param start offset of the first character to iterate
* @param limit offset of the character following the last character to
* iterate
*/
public UCharacterIterator(String str, int start, int limit)
{
m_replaceable_ = new ReplaceableString(str);
m_start_ = start;
m_limit_ = limit;
m_index_ = m_start_;
}
/**
* Abstract class that defines an API for iteration on text objects.This is an
* interface for forward and backward iteration and random access into a text
* object. Forward iteration is done with post-increment and backward iteration
* is done with pre-decrement semantics, while the
* <code>java.text.CharacterIterator</code> interface methods provided forward
* iteration with "pre-increment" and backward iteration with pre-decrement
* semantics. This API is more efficient for forward iteration over code points.
* @author Ram
* @version release 2.2, May 2002
*/
public abstract class UCharacterIterator
implements Cloneable,UForwardCharacterIterator {
// static final methods ----------------------------------------------------
/**
* Constructs an iterator over the given range of the given replaceable
* string.
* @param text text to be iterated over
* @param start offset of the first character to iterate
* @param limit offset of the character following the last character to
* iterate
* Returns a <code>UCharacterIterator</code> object given a
* <code>Replaceable</code> object.
* @param source a valid source as a <code>Replaceable</code> object
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
*/
public UCharacterIterator(Replaceable replaceable, int start, int limit)
{
m_replaceable_ = replaceable;
m_start_ = start;
m_limit_ = limit;
m_index_ = m_start_;
}
// public methods ----------------------------------------------------------
/**
* Creates a copy of this iterator.
* Cloning will not duplicate a new Replaceable object.
* @return copy of this iterator
*/
public Object clone()
{
try {
return super.clone();
}
catch (CloneNotSupportedException e) {
throw new InternalError(
"Cloning by the super class java.text.CharacterIterator is not " +
"supported");
}
}
/**
* Returns the current UTF16 character.
* @return current UTF16 character
*/
public char current()
{
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
return m_replaceable_.charAt(m_index_);
}
return DONE;
public static final UCharacterIterator getInstance(Replaceable source){
return new ReplaceableCharacterIterator(source);
}
/**
* Returns the current codepoint
* @return current codepoint
* Returns a <code>UCharacterIterator</code> object given a
* source string.
* @param source a string
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
*/
public int currentCodePoint()
{
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
return m_replaceable_.char32At(m_index_);
}
return DONE_CODEPOINT;
public static final UCharacterIterator getInstance(String source){
return new ReplaceableCharacterIterator(source);
}
/**
* Gets the first UTF16 character in text.
* @return the first UTF16 in text.
* Returns a <code>UCharacterIterator</code> object given a
* source character array.
* @param source an array of UTF-16 code units
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
*/
public char first()
{
m_index_ = m_start_;
return current();
public static final UCharacterIterator getInstance(char[] source){
return getInstance(source,0,source.length);
}
/**
* Returns the start of the text to iterate.
* @return by default this method will return 0, unless a range for
* iteration had been specified during construction.
* Returns a <code>UCharacterIterator</code> object given a
* source character array.
* @param source an array of UTF-16 code units
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
*/
public int getBeginIndex()
{
return m_start_;
public static final UCharacterIterator getInstance(char[] source, int start, int limit){
return new UCharArrayIterator(source,start,limit);
}
/**
* Returns a <code>UCharacterIterator</code> object given a
* source StringBuffer.
* @param source a string buffer of UTF-16 code units
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
*/
public static final UCharacterIterator getInstance(StringBuffer source){
return new ReplaceableCharacterIterator(source);
}
/**
* Returns the limit offset of the text to iterate
* @return by default this method returns the length of the text, unless a
* range for iteration had been specified during construction.
*/
public int getEndIndex()
{
return m_limit_;
* Returns a <code>UCharacterIterator</code> object given a
* CharacterIterator.
* @param source a valid CharacterIterator object.
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
*/
public static final UCharacterIterator getInstance(CharacterIterator source){
return new ICUCharacterIterator(source);
}
// public methods ----------------------------------------------------------
/**
* Returns a <code>java.text.CharacterIterator</code> object for
* the underlying text of this iterator. The returned iterator is
* independent of this iterator.
* @return java.text.CharacterIterator object
*/
public CharacterIterator getCharacterIterator(){
return new StringCharacterIterator(this.getText());
}
/**
* Returns the code unit at the current index. If index is out
* of range, returns DONE. Index is not changed.
* @return current code unit
*/
public abstract int current();
/**
 * Returns the code point at the current index without moving the index.
 * When the current code unit is a lead surrogate that is followed by a
 * trail surrogate, the supplementary code point formed by the pair is
 * returned; in every other case the result is whatever
 * <code>current()</code> returns (including DONE for an invalid index).
 * @return current codepoint
 */
public int currentCodePoint(){
    int lead = current();
    if(!UTF16.isLeadSurrogate((char)lead)){
        // not the start of a surrogate pair, nothing more to look at
        return lead;
    }
    // peek at the following code unit: step forward, read, step back so
    // that the index ends up where it started
    next();
    int trail = current();
    previous();
    if(UTF16.isTrailSurrogate((char)trail)){
        // complete surrogate pair, combine it
        return UCharacterProperty.getRawSupplementary((char)lead,
                                                      (char)trail);
    }
    // unpaired lead surrogate
    return lead;
}
/**
* Returns the length of the text
* @return length of the text
*/
public abstract int getLength();
/**
* Gets the current index in text.
* @return current index in text.
*/
public int getIndex()
{
return m_index_;
public abstract int getIndex();
/**
* Returns the UTF16 code unit at index, and increments to the next
* code unit (post-increment semantics). If index is out of
* range, DONE is returned, and the iterator is reset to the limit
* of the text.
* @return the next UTF16 code unit, or DONE if the index is at the limit
* of the text.
*/
public abstract int next();
/**
 * Returns the code point at index, and increments to the next code
 * point (post-increment semantics). If index does not point to a
 * valid surrogate pair, the behavior is the same as
 * <code>next()</code>. Otherwise the iterator is incremented past
 * the surrogate pair, and the code point represented by the pair
 * is returned.
 * @return the next codepoint in text, or DONE if the index is at
 *         the limit of the text.
 */
public int nextCodePoint(){
    int ch1 = next();
    if(UTF16.isLeadSurrogate((char)ch1)){
        int ch2 = next();
        if(UTF16.isTrailSurrogate((char)ch2)){
            return UCharacterProperty.getRawSupplementary((char)ch1,
                                                          (char)ch2);
        }else if(ch2 != DONE){
            // unmatched surrogate so back out; when ch2 is DONE the second
            // next() did not move, so backing out would step onto the lead
            // surrogate again and the iteration would never terminate
            previous();
        }
    }
    return ch1;
}
/**
* Decrement to the position of the previous code unit in the
* text, and return it (pre-decrement semantics). If the
* resulting index is less than 0, the index is reset to 0 and
* DONE is returned.
* @return the previous code unit in the text, or DONE if the new
* index is before the start of the text.
*/
public abstract int previous();
/**
 * Retreat to the start of the previous code point in the text,
 * and return it (pre-decrement semantics). If the index is not
 * preceded by a valid surrogate pair, the behavior is the same
 * as <code>previous()</code>. Otherwise the iterator is
 * decremented to the start of the surrogate pair, and the code
 * point represented by the pair is returned.
 * @return the previous code point in the text, or DONE if the new
 *         index is before the start of the text.
 */
public int previousCodePoint(){
    int ch1 = previous();
    if(UTF16.isTrailSurrogate((char)ch1)){
        int ch2 = previous();
        if(UTF16.isLeadSurrogate((char)ch2)){
            return UCharacterProperty.getRawSupplementary((char)ch2,
                                                          (char)ch1);
        }else if(ch2 != DONE){
            // unmatched trail surrogate so back out; when ch2 is DONE the
            // second previous() did not move, so calling next() would skip
            // forward over the trail surrogate that was just returned
            next();
        }
    }
    return ch1;
}
/**
* Sets the index to the specified index in the text.
* @param index the index within the text.
* @exception IndexOutOfBoundsException is thrown if an invalid index is
* supplied
*/
public abstract void setIndex(int index);
/**
* Sets the current index to the limit.
*/
public void setToLimit() {
setIndex(getLength());
}
/**
* Gets the last UTF16 iterateable character from the text and shifts the
* index to the end of the text accordingly.
* @return the last UTF16 iterateable character
* Sets the current index to the start.
*/
public char last()
{
if (m_limit_ != m_start_) {
m_index_ = m_limit_ - 1;
return m_replaceable_.charAt(m_index_);
}
m_index_ = m_limit_;
return DONE;
public void setToStart() {
setIndex(0);
}
/**
* Returns next UTF16 character and increments the iterator's index by 1.
* If the resulting index is greater or equal to the iteration limit, the
* index is reset to the text iteration limit and a value of DONE_CODEPOINT is
* returned.
* @return next UTF16 character in text or DONE if the new index is off the
* end of the text iteration limit.
*/
public char next()
{
if (m_index_ < m_limit_) {
char result = m_replaceable_.charAt(m_index_);
m_index_ ++;
return result;
}
return DONE;
}
/**
* Returns next codepoint after current index and increments the iterator's
* index by a number depending on the returned codepoint.
* This assumes the text is stored as 16-bit code units
* with surrogate pairs intermixed. If the index of a leading or trailing
* code unit of a surrogate pair is given, return the code point after the
* surrogate pair.
* If the resulting index is greater or equal to the text iterateable limit,
* the current index is reset to the text iterateable limit and a value of
* DONE_CODEPOINT is returned.
* @return next codepoint in text or DONE_CODEPOINT if the new index is off the
* end of the text iterateable limit.
*/
public int nextCodePoint()
{
if (m_index_ < m_limit_) {
char ch = m_replaceable_.charAt(m_index_);
m_index_ ++;
if (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
ch <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
m_index_ < m_limit_) {
char trail = m_replaceable_.charAt(m_index_);
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
m_index_ ++;
return UCharacterProperty.getRawSupplementary(ch,
trail);
}
}
return ch;
}
return DONE_CODEPOINT;
}
/**
* Returns previous UTF16 character and decrements the iterator's index by
* 1.
* If the resulting index is less than the text iterateable limit, the
* index is reset to the start of the text iteration and a value of
* DONE_CODEPOINT is returned.
* @return next UTF16 character in text or DONE if the new index is off the
* start of the text iteration range.
* Fills the buffer with the underlying text storage of the iterator
* If the buffer capacity is not enough a exception is thrown. The capacity
* of the fill in buffer should at least be equal to length of text in the
* iterator obtained by calling <code>getLength()</code>.
* <b>Usage:</b>
*
* <code>
* <pre>
* UCharacterIterator iter = UCharacterIterator.getInstance(text);
* char[] buf = new char[iter.getLength()];
* iter.getText(buf);
*
* OR
* char[] buf= new char[1];
* int len = 0;
* for(;;){
* try{
* len = iter.getText(buf);
* break;
* }catch(IndexOutOfBoundsException e){
* buf = new char[iter.getLength()];
* }
* }
* </pre>
* </code>
*
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @param offset the position within the array to start putting the data.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBounds exception if there is not enough
* room after offset in the array, or if offset < 0.
*/
public char previous()
{
if (m_index_ > m_start_) {
m_index_ --;
return m_replaceable_.charAt(m_index_);
}
return DONE;
}
/**
* Returns previous codepoint before current index and decrements the
* iterator's index by a number depending on the returned codepoint.
* This assumes the text is stored as 16-bit code units
* with surrogate pairs intermixed. If the index of a leading or trailing
* code unit of a surrogate pair is given, return the code point before the
* surrogate pair.
* If the resulting index is less than the text iterateable range, the
* current index is reset to the start of the range and a value of
* DONE_CODEPOINT is returned.
* @return previous codepoint in text or DONE_CODEPOINT if the new index is
* off the start of the text iteration range.
*/
public int previousCodePoint()
{
if (m_index_ > m_start_) {
m_index_ --;
char ch = m_replaceable_.charAt(m_index_);
if (ch >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
ch <= UTF16.TRAIL_SURROGATE_MAX_VALUE &&
m_index_ > m_start_) {
char lead = m_replaceable_.charAt(m_index_);
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
m_index_ --;
return UCharacterProperty.getRawSupplementary(ch,
lead);
}
}
return ch;
}
return DONE_CODEPOINT;
}
/**
* <p>Sets the index to the specified index in the text and returns that
* single UTF16 character at index.
* This assumes the text is stored as 16-bit code units.</p>
* @param index the index within the text.
* @exception IllegalArgumentException is thrown if an invalid index is
* supplied. i.e. index is out of bounds.
* @return the character at the specified index or DONE if the specified
* index is equal to the limit of the text iteration range.
*/
public char setIndex(int index)
{
if (index < m_start_ || index > m_limit_) {
throw new IllegalArgumentException("Index index out of bounds");
public int getText(char[] fillIn, int offset) {
int len = getLength();
if (offset < 0 || offset + len > fillIn.length) {
throw new IndexOutOfBoundsException(Integer.toString(offset));
}
m_index_ = index;
return current();
}
// private data members ----------------------------------------------------
/**
* Replacable object
*/
private Replaceable m_replaceable_;
/**
* Current index
*/
private int m_index_;
/**
* Start offset of iterateable range, by default this is 0
*/
private int m_start_;
/**
* Limit offset of iterateable range, by default this is the length of the
* string
*/
private int m_limit_;
int index = getIndex();
setToStart();
int ch;
while ((ch = next())!= DONE) {
fillIn[offset++] = (char)ch;
}
setIndex(index);
return len;
}
/**
* Convenience override for <code>getText(char[], int)</code> that provides
* an offset of 0.
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBounds exception if there is not enough
* room in the array.
*/
public final int getText(char[] fillIn) {
return getText(fillIn, 0);
}
/**
 * Convenience method that returns the underlying text storage as a string.
 * @return the text of the iterator, <code>getLength()</code> code units long
 */
public String getText() {
    char[] buffer = new char[getLength()];
    getText(buffer);
    return new String(buffer);
}
/**
 * Moves the current position by the number of code units
 * specified, either forward or backward depending on the sign
 * of delta (positive or negative respectively). If the resulting
 * index would be less than zero, the index is set to zero, and if
 * the resulting index would be greater than limit, the index is
 * set to limit.
 *
 * @param delta the number of code units to move the current
 *              index.
 * @return the new index.
 */
public int moveIndex(int delta) {
    // add in long arithmetic: index + delta may exceed the int range, and
    // a wrapped sum would be clamped to the wrong end of the text
    long target = (long)getIndex() + delta;
    int x = (int)Math.max(0L, Math.min(target, (long)getLength()));
    setIndex(x);
    return x;
}
/**
 * Moves the current position by the number of code points
 * specified, either forward or backward depending on the sign of
 * delta (positive or negative respectively). Movement is done with
 * <code>nextCodePoint()</code> and <code>previousCodePoint()</code>,
 * so an unpaired surrogate counts as one step.
 * If the start or the limit of the text is reached before
 * <code>delta</code> code points have been traversed, the iteration
 * stops there and an exception is thrown.
 * @param delta the number of code points to move the current index.
 * @return the new index
 * @exception IndexOutOfBoundsException is thrown if the text does not
 *            contain <code>delta</code> code points in the requested
 *            direction
 */
public int moveCodePointIndex(int delta){
    // Count a step only after it has succeeded, so that delta is exactly
    // zero when, and only when, every requested step was taken.
    if(delta>0){
        while(delta>0 && nextCodePoint() != DONE){
            delta--;
        }
    }else{
        while(delta<0 && previousCodePoint() != DONE){
            delta++;
        }
    }
    if(delta!=0){
        throw new IndexOutOfBoundsException();
    }
    return getIndex();
}
/**
* Creates a copy of this iterator, independent from other iterators.
* If it is not possible to clone the iterator, returns null.
* @return copy of this iterator
*/
public Object clone() throws CloneNotSupportedException{
return super.clone();
}
}

View file

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterPropertyDB.java $
* $Date: 2002/04/04 00:52:27 $
* $Revision: 1.8 $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -760,7 +760,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return size of the lower case character in UTF16 format
*/
public int getSpecialLowerCase(Locale locale, int index, int ch,
UCharacterIterator uchariter,
UnicodeCharacterIterator uchariter,
StringBuffer buffer)
{
int exception = getException(index,
@ -874,7 +874,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return size of the lowercased codepoint in UTF16 format
*/
public int toLowerCase(Locale locale, int ch,
UCharacterIterator uchariter,
UnicodeCharacterIterator uchariter,
StringBuffer buffer)
{
int props = getProperty(ch);
@ -909,7 +909,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return size oflowercased codepoint in UTF16 format
*/
public int toLowerCase(Locale locale, int ch,
UCharacterIterator uchariter, char buffer[])
UnicodeCharacterIterator uchariter, char buffer[])
{
int props = getProperty(ch);
if (!UCharacterProperty.isExceptionIndicator(props)) {
@ -953,7 +953,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
public void toLowerCase(Locale locale, String str, int start, int limit,
StringBuffer result)
{
UCharacterIterator ucharIter = new UCharacterIterator(str);
UnicodeCharacterIterator ucharIter = new UnicodeCharacterIterator(str);
int strIndex = start;
while (strIndex < limit) {
@ -980,7 +980,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return size of uppercased codepoint in UTF16 format
*/
public int getSpecialUpperOrTitleCase(Locale locale, int index, int ch,
UCharacterIterator uchariter,
UnicodeCharacterIterator uchariter,
boolean upperflag,
StringBuffer buffer)
{
@ -1041,7 +1041,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return size of uppercased codepoint in UTF16 format
*/
public int toUpperOrTitleCase(Locale locale, int ch,
UCharacterIterator uchariter,
UnicodeCharacterIterator uchariter,
boolean upperflag, StringBuffer buffer)
{
int props = getProperty(ch);
@ -1083,7 +1083,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return size of uppercased codepoint in UTF16 format
*/
public int toUpperOrTitleCase(Locale locale, int ch,
UCharacterIterator uchariter,
UnicodeCharacterIterator uchariter,
boolean upperflag, char buffer[])
{
int props = getProperty(ch);
@ -1133,7 +1133,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
*/
public String toUpperCase(Locale locale, String str, int start, int limit)
{
UCharacterIterator ucharIter = new UCharacterIterator(str);
UnicodeCharacterIterator ucharIter = new UnicodeCharacterIterator(str);
int strIndex = start;
StringBuffer result = new StringBuffer(limit - start);
@ -1170,7 +1170,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
public String toTitleCase(Locale locale, String str,
BreakIterator breakiter)
{
UCharacterIterator ucharIter = new UCharacterIterator(str);
UnicodeCharacterIterator ucharIter = new UnicodeCharacterIterator(str);
int length = str.length();
StringBuffer result = new StringBuffer();
@ -1583,13 +1583,13 @@ public final class UCharacterProperty implements Trie.DataManipulate
* the set { 'i', 'j', U+012f, U+1e2d, U+1ecb }
* @see SpecialCasing.txt
*/
private static boolean isAFTER_i(UCharacterIterator uchariter, int offset)
private static boolean isAFTER_i(UnicodeCharacterIterator uchariter, int offset)
{
uchariter.setIndex(offset);
int ch = uchariter.previousCodePoint();
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
if (ch == LATIN_SMALL_LETTER_I_ || ch == LATIN_SMALL_LETTER_J_ ||
ch == LATIN_SMALL_LETTER_I_WITH_OGONEK_ ||
ch == LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ ||
@ -1618,13 +1618,13 @@ public final class UCharacterProperty implements Trie.DataManipulate
* character 'I' with no intervening combining class = 230
* @see SpecialCasing.txt
*/
private static boolean isAFTER_I(UCharacterIterator uchariter, int offset)
private static boolean isAFTER_I(UnicodeCharacterIterator uchariter, int offset)
{
uchariter.setIndex(offset);
int ch = uchariter.previousCodePoint();
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
if (ch == LATIN_CAPITAL_LETTER_I_) {
return true; // preceded by I
}
@ -1650,14 +1650,14 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return false if any character after offset in src is a cased letter
* @see SpecialCasing.txt
*/
private boolean isCFINAL(UCharacterIterator uchariter, int offset)
private boolean isCFINAL(UnicodeCharacterIterator uchariter, int offset)
{
// iterator should have been determined to be not null by caller
uchariter.setIndex(offset);
uchariter.nextCodePoint(); // rid of current codepoint
int ch = uchariter.nextCodePoint(); // start checking
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
int cat = getType(ch);
if (cat == UCharacterCategory.LOWERCASE_LETTER ||
cat == UCharacterCategory.UPPERCASE_LETTER ||
@ -1681,13 +1681,13 @@ public final class UCharacterProperty implements Trie.DataManipulate
* @return true if any character before index in src is a cased letter
* @see SpecialCasing.txt
*/
private boolean isNotCINITIAL(UCharacterIterator uchariter,
private boolean isNotCINITIAL(UnicodeCharacterIterator uchariter,
int offset)
{
uchariter.setIndex(offset);
int ch = uchariter.previousCodePoint();
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
int cat = getType(ch);
if (cat == UCharacterCategory.LOWERCASE_LETTER ||
cat == UCharacterCategory.UPPERCASE_LETTER ||
@ -1712,14 +1712,14 @@ public final class UCharacterProperty implements Trie.DataManipulate
* of combining class = 230.
* @see SpecialCasing.txt
*/
private static boolean isFollowedByMOREABOVE(UCharacterIterator uchariter,
private static boolean isFollowedByMOREABOVE(UnicodeCharacterIterator uchariter,
int offset)
{
uchariter.setIndex(offset);
uchariter.nextCodePoint(); // rid of current codepoint
int ch = uchariter.nextCodePoint(); // start checking
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
int cc = NormalizerImpl.getCombiningClass(ch);
if (cc == COMBINING_MARK_ABOVE_CLASS_) {
return true; // at least one cc==230 following
@ -1742,14 +1742,14 @@ public final class UCharacterProperty implements Trie.DataManipulate
* with no characters of combining class == 230 in between
* @see SpecialCasing.txt
*/
private static boolean isFollowedByDotAbove(UCharacterIterator uchariter,
private static boolean isFollowedByDotAbove(UnicodeCharacterIterator uchariter,
int offset)
{
uchariter.setIndex(offset);
uchariter.nextCodePoint(); // rid off current character
int ch = uchariter.nextCodePoint(); // start checking
while (ch != UCharacterIterator.DONE_CODEPOINT) {
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
if (ch == COMBINING_DOT_ABOVE_) {
return true;
}

View file

@ -0,0 +1,93 @@
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UForwardCharacterIterator.java,v $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.1 $
*
*****************************************************************************************
*/
package com.ibm.icu.impl;
/**
* Interface that defines an API for forward-only iteration
* on text objects.
* This is a minimal interface for iteration without random access
* or backwards iteration. It is especially useful for wrapping
* streams with converters into an object for collation or
* normalization.
*
* <p>Characters can be accessed in two ways: as code units or as
* code points.
* Unicode code points are 21-bit integers and are the scalar values
* of Unicode characters. ICU uses the type <code>int</code> for them.
* Unicode code units are the storage units of a given
* Unicode/UCS Transformation Format (a character encoding scheme).
* With UTF-16, all code points can be represented with either one
* or two code units ("surrogates").
* String storage is typically based on code units, while properties
* of characters are typically determined using code point values.
* Some processes may be designed to work with sequences of code units,
* or it may be known that all characters that are important to an
* algorithm can be represented with single code units.
* Other processes will need to use the code point access functions.</p>
*
* <p>ForwardCharacterIterator provides next() to access
* a code unit and advance an internal position into the text object,
* similar to a <code>return text[position++]</code>.<br>
* It provides nextCodePoint() to access a code point and advance an internal
* position.</p>
*
* <p>nextCodePoint() assumes that the current position is that of
* the beginning of a code point, i.e., of its first code unit.
* After nextCodePoint(), this will be true again.
* In general, access to code units and code points in the same
* iteration loop should not be mixed. In UTF-16, if the current position
* is on a second code unit (Low Surrogate), then only that code unit
* is returned even by nextCodePoint().</p>
*
* Usage:
* <code>
* public void function1(UForwardCharacterIterator it) {
* int c;
* while((c=it.next())!=UForwardCharacterIterator.DONE) {
* // use c
* }
* }
* </code>
* </p>
*
*/
public interface UForwardCharacterIterator {
/**
* Indicator that we have reached the ends of the UTF16 text.
*/
public static final int DONE = -1;
/**
* Returns the UTF16 code unit at index, and increments to the next
* code unit (post-increment semantics). If index is out of
* range, DONE is returned, and the iterator is reset to the limit
* of the text.
* @return the next UTF16 code unit, or DONE if the index is at the limit
* of the text.
*/
public int next();
/**
* Returns the code point at index, and increments to the next code
* point (post-increment semantics). If index does not point to a
* valid surrogate pair, the behavior is the same as
* <code>next()<code>. Otherwise the iterator is incremented past
* the surrogate pair, and the code point represented by the pair
* is returned.
* @return the next codepoint in text, or DONE if the index is at
* the limit of the text.
*/
public int nextCodePoint();
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/USerializedSet.java,v $
* $Date: 2002/03/28 01:50:59 $
* $Revision: 1.2 $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -31,6 +31,8 @@ public final class USerializedSet {
arrayOffset=bmpLength=length=0;
length=src[srcStart++];
if((length&0x8000) >0) {
/* there are supplementary values */
length&=0x7fff;
@ -47,8 +49,9 @@ public final class USerializedSet {
}
bmpLength=length;
}
array=src;
arrayOffset=srcStart;
array = new char[length];
System.arraycopy(src,srcStart,array,0,length);
//arrayOffset=srcStart;
return true;
}
@ -83,9 +86,7 @@ public final class USerializedSet {
if(rangeIndex<0) {
return false;
}
if(array==null){
array = new char[8];
}
range=new int[2];
rangeIndex*=2; /* address start/limit pairs */
@ -122,7 +123,7 @@ public final class USerializedSet {
if( 0x10ffff<c) {
return;
}
if(c<0xffff) {
bmpLength=length=2;
array[0]=(char)c;
@ -157,7 +158,9 @@ public final class USerializedSet {
if(array==null){
array = new char[8];
}
range=new int[2];
if(range==null || range.length <2){
throw new IllegalArgumentException();
}
rangeIndex*=2; /* address start/limit pairs */
if(rangeIndex<bmpLength) {
range[0]=array[rangeIndex++];
@ -168,6 +171,7 @@ public final class USerializedSet {
} else {
range[1]=0x110000;
}
range[1]-=1;
return true;
} else {
rangeIndex-=bmpLength;
@ -182,7 +186,8 @@ public final class USerializedSet {
} else {
range[1]=0x110000;
}
return false;
range[1]-=1;
return true;
} else {
return false;
}
@ -216,6 +221,6 @@ public final class USerializedSet {
return (bmpLength+(length-bmpLength)/2+1)/2;
}
private char array[];
private char array[] = new char[8];
private int arrayOffset, bmpLength, length;
}

View file

@ -0,0 +1,339 @@
/*
*******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UnicodeCharacterIterator.java,v $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.UTF16;
import java.text.CharacterIterator;
/**
 * Internal class that iterates over a com.ibm.icu.text.Replaceable text
 * object and returns either UTF16 code units or whole code points.
 * The iteration is restricted to the half-open range [start, limit) that is
 * fixed at construction time.
 * @author synwee
 * @version release 2.1, February 2002
 */
public final class UnicodeCharacterIterator implements CharacterIterator
{
    // public data members -----------------------------------------------------

    /**
     * Indicator that we have reached the ends of the UTF16 text when returning
     * 16 bit character.
     * Has the same numeric value as java.text.CharacterIterator.DONE, but is
     * declared as an int.
     */
    public static final int DONE = 0xFFFF;

    /**
     * Indicator that we have reached the ends of the UTF16 text when returning
     * codepoints.
     */
    public static final int DONE_CODEPOINT = -1;

    // public constructor ------------------------------------------------------

    /**
     * Public constructor.
     * By default the iteration range will be from 0 to the end of the text.
     * @param replaceable text which the iterator will be based on
     */
    public UnicodeCharacterIterator(Replaceable replaceable)
    {
        m_replaceable_ = replaceable;
        m_index_       = 0;
        m_start_       = 0;
        m_limit_       = replaceable.length();
    }

    /**
     * Public constructor.
     * By default the iteration range will be from 0 to the end of the text.
     * @param str text which the iterator will be based on
     */
    public UnicodeCharacterIterator(String str)
    {
        m_replaceable_ = new ReplaceableString(str);
        m_index_       = 0;
        m_start_       = 0;
        m_limit_       = m_replaceable_.length();
    }

    /**
     * Constructs an iterator over the given range of the given string.
     * The range is stored as given; it is not validated against the text.
     * @param str text to be iterated over
     * @param start offset of the first character to iterate
     * @param limit offset of the character following the last character to
     *        iterate
     */
    public UnicodeCharacterIterator(String str, int start, int limit)
    {
        m_replaceable_ = new ReplaceableString(str);
        m_start_       = start;
        m_limit_       = limit;
        m_index_       = m_start_;
    }

    /**
     * Constructs an iterator over the given range of the given replaceable
     * string.
     * The range is stored as given; it is not validated against the text.
     * @param replaceable text to be iterated over
     * @param start offset of the first character to iterate
     * @param limit offset of the character following the last character to
     *        iterate
     */
    public UnicodeCharacterIterator(Replaceable replaceable, int start, int limit)
    {
        m_replaceable_ = replaceable;
        m_start_       = start;
        m_limit_       = limit;
        m_index_       = m_start_;
    }

    // public methods ----------------------------------------------------------

    /**
     * Creates a copy of this iterator.
     * Cloning will not duplicate the Replaceable object; the copy refers to
     * the same text as this iterator.
     * @return copy of this iterator
     */
    public Object clone()
    {
        try {
            return super.clone();
        }
        catch (CloneNotSupportedException e) {
            throw new InternalError(
            "Cloning by the super class java.text.CharacterIterator is not " +
            "supported");
        }
    }

    /**
     * Returns the UTF16 character at the current index without moving the
     * iterator.
     * @return current UTF16 character, or DONE if the current index lies
     *         outside the iteration range
     */
    public char current()
    {
        if (m_index_ >= m_start_ && m_index_ < m_limit_) {
            return m_replaceable_.charAt(m_index_);
        }
        return DONE;
    }

    /**
     * Returns the codepoint at the current index without moving the
     * iterator. The value is obtained from Replaceable.char32At().
     * @return current codepoint, or DONE_CODEPOINT if the current index lies
     *         outside the iteration range
     */
    public int currentCodePoint()
    {
        if (m_index_ >= m_start_ && m_index_ < m_limit_) {
            return m_replaceable_.char32At(m_index_);
        }
        return DONE_CODEPOINT;
    }

    /**
     * Moves the index to the start of the iteration range and gets the
     * UTF16 character there.
     * @return the first UTF16 character in the range, or DONE if the range
     *         is empty
     */
    public char first()
    {
        m_index_ = m_start_;
        return current();
    }

    /**
     * Returns the start of the text to iterate.
     * @return by default this method will return 0, unless a range for
     *         iteration had been specified during construction.
     */
    public int getBeginIndex()
    {
        return m_start_;
    }

    /**
     * Returns the limit offset of the text to iterate
     * @return by default this method returns the length of the text, unless a
     *         range for iteration had been specified during construction.
     */
    public int getEndIndex()
    {
        return m_limit_;
    }

    /**
     * Gets the current index in text.
     * @return current index in text.
     */
    public int getIndex()
    {
        return m_index_;
    }

    /**
     * Gets the last UTF16 iterateable character from the text and moves the
     * index onto that character. If the iteration range is empty the index
     * is set to the limit instead.
     * @return the last UTF16 iterateable character, or DONE if the range is
     *         empty
     */
    public char last()
    {
        if (m_limit_ != m_start_) {
            m_index_ = m_limit_ - 1;
            return m_replaceable_.charAt(m_index_);
        }
        m_index_ = m_limit_;
        return DONE;
    }

    /**
     * Returns the UTF16 character at the current index and then increments
     * the iterator's index by 1 (post-increment semantics).
     * If the current index is already at or past the iteration limit, the
     * index is left unchanged and a value of DONE is returned.
     * @return next UTF16 character in text or DONE if the index is at the
     *         end of the text iteration limit.
     */
    public char next()
    {
        if (m_index_ < m_limit_) {
            char result = m_replaceable_.charAt(m_index_);
            m_index_ ++;
            return result;
        }
        return DONE;
    }

    /**
     * Returns the codepoint starting at the current index and advances the
     * iterator's index past it (post-increment semantics).
     * This assumes the text is stored as 16-bit code units with surrogate
     * pairs intermixed. If the current code unit is a lead surrogate that is
     * followed, inside the iteration range, by a trail surrogate, the
     * supplementary codepoint is returned and the index advances by 2.
     * Otherwise the single code unit is returned as is (this includes
     * unpaired surrogates) and the index advances by 1.
     * If the current index is already at or past the iteration limit, the
     * index is left unchanged and a value of DONE_CODEPOINT is returned.
     * @return next codepoint in text or DONE_CODEPOINT if the index is at the
     *         end of the text iterateable limit.
     */
    public int nextCodePoint()
    {
        if (m_index_ < m_limit_) {
            char ch = m_replaceable_.charAt(m_index_);
            m_index_ ++;
            if (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
                ch <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
                m_index_ < m_limit_) {
                char trail = m_replaceable_.charAt(m_index_);
                if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
                    trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
                    m_index_ ++;
                    return UCharacterProperty.getRawSupplementary(ch,
                                                                  trail);
                }
            }
            return ch;
        }
        return DONE_CODEPOINT;
    }

    /**
     * Decrements the iterator's index by 1 and returns the UTF16 character
     * at the new index (pre-decrement semantics).
     * If the current index is already at the start of the iteration range,
     * the index is left unchanged and a value of DONE is returned.
     * @return previous UTF16 character in text or DONE if the index is at
     *         the start of the text iteration range.
     */
    public char previous()
    {
        if (m_index_ > m_start_) {
            m_index_ --;
            return m_replaceable_.charAt(m_index_);
        }
        return DONE;
    }

    /**
     * Returns the codepoint that ends just before the current index and
     * moves the iterator's index back onto the start of that codepoint.
     * This assumes the text is stored as 16-bit code units with surrogate
     * pairs intermixed. If the code unit before the current index is a trail
     * surrogate that is preceded, inside the iteration range, by a lead
     * surrogate, the supplementary codepoint is returned and the index moves
     * back by 2. Otherwise the single code unit is returned as is (this
     * includes unpaired surrogates) and the index moves back by 1.
     * If the current index is already at the start of the iteration range,
     * the index is left unchanged and a value of DONE_CODEPOINT is returned.
     * @return previous codepoint in text or DONE_CODEPOINT if the index is
     *         at the start of the text iteration range.
     */
    public int previousCodePoint()
    {
        if (m_index_ > m_start_) {
            m_index_ --;
            char ch = m_replaceable_.charAt(m_index_);
            if (ch >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
                ch <= UTF16.TRAIL_SURROGATE_MAX_VALUE &&
                m_index_ > m_start_) {
                // the candidate lead surrogate is the code unit *before* the
                // trail surrogate that was just read
                char lead = m_replaceable_.charAt(m_index_ - 1);
                if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
                    lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
                    m_index_ --;
                    // same (lead, trail) argument order as nextCodePoint()
                    return UCharacterProperty.getRawSupplementary(lead,
                                                                  ch);
                }
            }
            return ch;
        }
        return DONE_CODEPOINT;
    }

    /**
     * <p>Sets the index to the specified index in the text and returns that
     * single UTF16 character at index.
     * This assumes the text is stored as 16-bit code units.</p>
     * @param index the index within the text.
     * @exception IllegalArgumentException is thrown if an invalid index is
     *            supplied. i.e. index is out of bounds.
     * @return the character at the specified index or DONE if the specified
     *         index is equal to the limit of the text iteration range.
     */
    public char setIndex(int index)
    {
        if (index < m_start_ || index > m_limit_) {
            throw new IllegalArgumentException("Index index out of bounds");
        }
        m_index_ = index;
        return current();
    }

    // private data members ----------------------------------------------------

    /**
     * Replaceable object holding the text that is iterated over
     */
    private Replaceable m_replaceable_;
    /**
     * Current index
     */
    private int m_index_;
    /**
     * Start offset of iterateable range, by default this is 0
     */
    private int m_start_;
    /**
     * Limit offset of iterateable range, by default this is the length of the
     * string
     */
    private int m_limit_;
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Utility.java,v $
* $Date: 2002/02/25 22:43:57 $
* $Revision: 1.23 $
* $Date: 2002/06/20 01:18:09 $
* $Revision: 1.24 $
*
*****************************************************************************************
*/
@ -91,6 +91,25 @@ public final class Utility {
return true;
}
/**
 * Convenience utility to compare a region of one char[] with a region of
 * another char[], element by element.
 * Ought to be in System.
 * @param source first array
 * @param sourceStart index in source at which the comparison starts
 * @param target second array
 * @param targetStart index in target at which the comparison starts
 * @param len the length to compare.
 * The start indices and start+len must be valid.
 * @return true if the two regions hold the same chars (also when no
 *         element is compared), false at the first difference
 */
public final static boolean arrayRegionMatches(char[] source, int sourceStart,
                                               char[] target, int targetStart,
                                               int len)
{
    int sourceCursor = sourceStart;
    int targetCursor = targetStart;
    final int sourceLimit = sourceStart + len;
    // walk both regions in lock step and stop at the first mismatch
    while (sourceCursor < sourceLimit) {
        if (source[sourceCursor++] != target[targetCursor++]) {
            return false;
        }
    }
    return true;
}
/**
* Convenience utility to compare two int[]s.
* @param len the length to compare.

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:665f02a0fd842a47ca65ecf36c1d301ef5cae01990b68f05695cfc693a783406
size 106300
oid sha256:a5b2036d17d077b24f01e187e005a8cd3d84bfd9fea94c505eb24db9ca57492a
size 108044

View file

@ -5,14 +5,14 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/BOSCU.java,v $
* $Date: 2002/05/14 16:48:48 $
* $Revision: 1.1 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.2 $
*
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.UCharacterIterator;
import com.ibm.icu.impl.UnicodeCharacterIterator;
/**
* <p>Binary Ordered Compression Scheme for Unicode</p>
@ -105,9 +105,9 @@ public class BOSCU
int offset)
{
int prev = 0;
UCharacterIterator iterator = new UCharacterIterator(source);
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator(source);
int codepoint = iterator.nextCodePoint();
while (codepoint != UCharacterIterator.DONE_CODEPOINT) {
while (codepoint != UnicodeCharacterIterator.DONE_CODEPOINT) {
if (prev < 0x4e00 || prev >= 0xa000) {
prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
}
@ -133,9 +133,9 @@ public class BOSCU
{
int prev = 0;
int result = 0;
UCharacterIterator iterator = new UCharacterIterator(source);
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator(source);
int codepoint = iterator.nextCodePoint();
while (codepoint != UCharacterIterator.DONE_CODEPOINT) {
while (codepoint != UnicodeCharacterIterator.DONE_CODEPOINT) {
if (prev < 0x4e00 || prev >= 0xa000) {
prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CanonicalIterator.java,v $
* $Date: 2002/03/20 22:55:33 $
* $Revision: 1.9 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.10 $
*
*****************************************************************************************
*/
@ -17,7 +17,8 @@ import com.ibm.icu.lang.*;
import java.util.Enumeration;
import java.util.Vector;
import java.util.*;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.USerializedSet;
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
* string. For example, here are some sample results:
@ -103,7 +104,7 @@ public class CanonicalIterator {
* while changing the source string, saving object creation.
*/
public void setSource(String newSource) {
source = Normalizer.normalize(newSource, Normalizer.DECOMP, 0);
source = Normalizer.normalize(newSource, Normalizer.NFD);
done = false;
// catch degenerate case
@ -122,9 +123,10 @@ public class CanonicalIterator {
// i should be the end of the first code point
int i = UTF16.findOffsetFromCodePoint(source, 1);
for (; i < source.length(); i += UTF16.getCharCount(i)) {
cp = UTF16.charAt(source, i);
if (SAFE_START.contains(cp)) {
if (NormalizerImpl.isCanonSafeStart(cp)) {
list.add(source.substring(start, i)); // add up to i
start = i;
}
@ -195,21 +197,21 @@ public class CanonicalIterator {
/**
*@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
*@internal
*/
*
public static UnicodeSet getSafeStart() {
return (UnicodeSet) SAFE_START.clone();
}
*/
/**
*@return the set of characters whose decompositions start with the given character
*@internal
*/
*
public static UnicodeSet getStarts(int cp) {
UnicodeSet result = AT_START.get(cp);
if (result == null) result = EMPTY;
return (UnicodeSet) result.clone();
}
*/
// ===================== PRIVATES ==============================
@ -253,7 +255,7 @@ public class CanonicalIterator {
String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
if (attempt.equals(segment)) {
*/
if (Normalizer.isEquivalent(possible, segment, Normalizer.DECOMP, 0)) {
if (Normalizer.compare(possible, segment,0)==0) {
if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
result.add(possible);
@ -272,6 +274,54 @@ public class CanonicalIterator {
private Set getEquivalents2(String segment) {
Set result = new HashSet();
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
result.add(segment);
StringBuffer workingBuffer = new StringBuffer();
// cycle through all the characters
int cp=0,end=0;
int[] range = new int[2];
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
// see if any character is at the start of some decomposition
cp = UTF16.charAt(segment, i);;
USerializedSet starts = new USerializedSet();
if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
continue;
}
int j=0;
// if so, see which decompositions match
for(j = 0, cp = end+1; cp <= end ||starts.getSerializedRange(j++, range); ++cp) {
if(cp>end){
cp=range[0];
end=range[1];
}
Set remainder = extract(cp, segment, i,workingBuffer);
if (remainder == null) continue;
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp);
int el = -1;
Iterator iter = remainder.iterator();
while (iter.hasNext()) {
String item = (String) iter.next();
String toAdd = new String(prefix);
toAdd += item;
result.add(toAdd);
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
}
}
}
return result;
/*
Set result = new HashSet();
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
result.add(segment);
@ -283,6 +333,7 @@ public class CanonicalIterator {
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
// see if any character is at the start of some decomposition
cp = UTF16.charAt(segment, i);
NormalizerImpl.getCanonStartSet(c,fillSet)
UnicodeSet starts = AT_START.get(cp);
if (starts == null) continue;
UnicodeSetIterator usi = new UnicodeSetIterator(starts);
@ -305,6 +356,7 @@ public class CanonicalIterator {
}
}
return result;
*/
}
/**
@ -317,7 +369,7 @@ public class CanonicalIterator {
+ ", " + NAME.transliterate(segment.substring(segmentPos)));
//String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(comp, Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(comp, Normalizer.NFD);
// See if it matches the start of segment (at segmentPos)
boolean ok = false;
@ -369,7 +421,7 @@ public class CanonicalIterator {
if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
*/
if (!Normalizer.isEquivalent(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), Normalizer.DECOMP, 0)) return null;
if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
// get the remaining combinations
return getEquivalents2(remainder);
@ -392,16 +444,18 @@ public class CanonicalIterator {
SET_WITH_NULL_STRING.add("");
}
private static UnicodeSet SAFE_START = new UnicodeSet();
private static CharMap AT_START = new CharMap();
// private static UnicodeSet SAFE_START = new UnicodeSet();
// private static CharMap AT_START = new CharMap();
// TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
// Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
private static int LAST_UNICODE = 0x10FFFF;
// private static int LAST_UNICODE = 0x10FFFF;
/*
static {
buildData();
}
*/
/*
private static void buildData() {
if (PROGRESS) System.out.println("Getting Safe Start");
@ -417,10 +471,10 @@ public class CanonicalIterator {
for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
if (Normalizer.isNormalized(cp, Normalizer.DECOMP, 0)) continue;
if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
//String istr = UTF16.valueOf(cp);
String decomp = Normalizer.normalize(cp, Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(cp, Normalizer.NFD);
//if (decomp.equals(istr)) continue;
// add each character in the decomposition to canBeIn
@ -437,7 +491,7 @@ public class CanonicalIterator {
}
if (PROGRESS) System.out.println();
}
*/
// the following is just for a map from characters to a set of characters
private static class CharMap {

View file

@ -5,12 +5,14 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/ComposedCharIter.java,v $
* $Date: 2002/02/16 03:06:05 $
* $Revision: 1.3 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.4 $
*
*****************************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Utility;
/**
* <tt>ComposedCharIter</tt> is an iterator class that returns all
@ -51,6 +53,7 @@ package com.ibm.icu.text;
* <tt>ComposedCharIter</tt> is currently based on version 2.1.8 of the
* <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
* It will be updated as later versions of Unicode are released.
* @deprecated
*/
public final class ComposedCharIter {
@ -59,7 +62,7 @@ public final class ComposedCharIter {
* {@link #next} returns this value when there are no more composed characters
* over which to iterate.
*/
public static final char DONE = Normalizer.DONE;
public static final char DONE = (char) Normalizer.DONE;
/**
* Construct a new <tt>ComposedCharIter</tt>. The iterator will return
@ -67,8 +70,8 @@ public final class ComposedCharIter {
* Hangul characters.
*/
public ComposedCharIter() {
minDecomp = DecompData.MAX_COMPAT;
hangul = false;
compat = false;
options =0;
}
@ -86,10 +89,8 @@ public final class ComposedCharIter {
* Jamo decompositions.
*/
public ComposedCharIter(boolean compat, int options) {
// Compatibility explosions have lower indices; skip them if necessary
minDecomp = compat ? 0 : DecompData.MAX_COMPAT;
hangul = (options & Normalizer.IGNORE_HANGUL) == 0;
this.compat = compat;
this.options = options;
}
/**
@ -97,10 +98,10 @@ public final class ComposedCharIter {
* by {@link #next}.
*/
public boolean hasNext() {
if (nextChar == DONE) {
if (nextChar == Normalizer.DONE) {
findNextChar();
}
return nextChar != DONE;
return nextChar != Normalizer.DONE;
}
/**
@ -111,12 +112,12 @@ public final class ComposedCharIter {
* to <tt>next</tt> will return {@link #DONE}.
*/
public char next() {
if (nextChar == DONE) {
if (nextChar == Normalizer.DONE) {
findNextChar();
}
curChar = nextChar;
nextChar = DONE;
return curChar;
nextChar = Normalizer.DONE;
return (char) curChar;
}
/**
@ -126,42 +127,38 @@ public final class ComposedCharIter {
* affected by the settings of the options passed to the constructor.
*/
public String decomposition() {
StringBuffer result = new StringBuffer();
int pos = (char)(DecompData.offsets.elementAt(curChar) & DecompData.DECOMP_MASK);
if (pos > minDecomp) {
Normalizer.doAppend(DecompData.contents, pos, result);
} else if (hangul && curChar >= HANGUL_BASE && curChar < HANGUL_LIMIT) {
Normalizer.hangulToJamo(curChar, result, minDecomp);
} else {
result.append(curChar);
}
return result.toString();
// the decomposition buffer contains the decomposition of
// current char so just return it
return new String(decompBuf,0, bufLen);
}
private void findNextChar() {
if (curChar != DONE) {
char ch = curChar;
while (++ch < 0xFFFF) {
int offset = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
if (offset > minDecomp
|| (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) ) {
nextChar = ch;
int c=curChar+1;
for(;;){
if(c < 0xFFFF){
bufLen = NormalizerImpl.getDecomposition(c,compat,
decompBuf,0,
decompBuf.length);
if(bufLen>0){
// the curChar can be decomposed... so it is a composed char
// cache the result
break;
}
}
}
c++;
}else{
c=Normalizer.DONE;
break;
}
}
nextChar=c;
}
private final int minDecomp;
private final boolean hangul;
private int options;
private boolean compat;
private char[] decompBuf = new char[100];
private int bufLen=0;
private int curChar = 0;
private int nextChar = Normalizer.DONE;
private char curChar = 0;
private char nextChar = Normalizer.DONE;
private static final char HANGUL_BASE = Normalizer.HANGUL_BASE;
private static final char HANGUL_LIMIT = Normalizer.HANGUL_LIMIT;
};

View file

@ -5,15 +5,15 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/LowercaseTransliterator.java,v $
* $Date: 2002/04/03 00:00:00 $
* $Revision: 1.10 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.11 $
*
*****************************************************************************************
*/
package com.ibm.icu.text;
import java.util.*;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.UCharacterIterator;
import com.ibm.icu.impl.UnicodeCharacterIterator;
/**
* A transliterator that performs locale-sensitive toLower()
@ -63,7 +63,7 @@ class LowercaseTransliterator extends Transliterator{
// get string for context
// TODO: add convenience method to do this, since we do it all over
UCharacterIterator original = new UCharacterIterator(text);
UnicodeCharacterIterator original = new UnicodeCharacterIterator(text);
// Walk through original string
// If there is a case change, modify corresponding position in replaceable

View file

@ -14,7 +14,7 @@ import com.ibm.icu.lang.*;
/**
* @author Alan Liu
* @version $RCSfile: NormalizationTransliterator.java,v $ $Revision: 1.17 $ $Date: 2002/02/25 22:43:58 $
* @version $RCSfile: NormalizationTransliterator.java,v $ $Revision: 1.18 $ $Date: 2002/06/20 01:21:18 $
*/
final class NormalizationTransliterator extends Transliterator {
@ -57,25 +57,25 @@ final class NormalizationTransliterator extends Transliterator {
Transliterator.registerFactory("Any-NFC", new Transliterator.Factory() {
public Transliterator getInstance(String ID) {
return NormalizationTransliterator.
getInstance(Normalizer.COMPOSE);
getInstance(Normalizer.NFC);
}
});
Transliterator.registerFactory("Any-NFD", new Transliterator.Factory() {
public Transliterator getInstance(String ID) {
return NormalizationTransliterator.
getInstance(Normalizer.DECOMP);
getInstance(Normalizer.NFD);
}
});
Transliterator.registerFactory("Any-NFKC", new Transliterator.Factory() {
public Transliterator getInstance(String ID) {
return NormalizationTransliterator.
getInstance(Normalizer.COMPOSE_COMPAT);
getInstance(Normalizer.NFKC);
}
});
Transliterator.registerFactory("Any-NFKD", new Transliterator.Factory() {
public Transliterator getInstance(String ID) {
return NormalizationTransliterator.
getInstance(Normalizer.DECOMP_COMPAT);
getInstance(Normalizer.NFKD);
}
});
Transliterator.registerSpecialInverse("NFC", "NFD", true);
@ -89,7 +89,21 @@ final class NormalizationTransliterator extends Transliterator {
int opt) {
StringBuffer id = new StringBuffer("NF");
int choice = 0;
if (m.compat()) {
if(m==Normalizer.NFC){
id.append("C");
choice |= C;
}else if(m==Normalizer.NFKC){
id.append("KC");
choice |= KC;
}else if(m==Normalizer.NFD){
id.append("D");
choice |= D;
}else if(m==Normalizer.NFKD){
id.append("KD");
choice |= KD;
}
/*if (m.compat()) {
id.append('K');
choice |= KD;
}
@ -98,7 +112,7 @@ final class NormalizationTransliterator extends Transliterator {
choice |= C;
} else {
id.append('D');
}
}*/
return new NormalizationTransliterator(id.toString(), m, choice, opt);
}
@ -185,7 +199,7 @@ final class NormalizationTransliterator extends Transliterator {
}
text.getChars(lastSafe, limit, buffer, 0);
String input = new String(buffer, 0, len); // TODO: fix normalizer to take char[]
String output = Normalizer.normalize(input, mode, options);
String output = Normalizer.normalize(input, mode);
// verify OK, if specified
if (verify != null) {

File diff suppressed because it is too large Load diff

View file

@ -3,13 +3,13 @@
* others. All Rights Reserved.
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TitlecaseTransliterator.java,v $
* $Date: 2002/04/02 23:59:59 $
* $Revision: 1.15 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.16 $
*/
package com.ibm.icu.text;
import java.util.*;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.UCharacterIterator;
import com.ibm.icu.impl.UnicodeCharacterIterator;
/**
* A transliterator that converts all letters (as defined by
@ -92,7 +92,7 @@ class TitlecaseTransliterator extends Transliterator {
// get string for context
// TODO: add convenience method to do this, since we do it all over
UCharacterIterator original = new UCharacterIterator(text);
UnicodeCharacterIterator original = new UnicodeCharacterIterator(text);
// Walk through original string
// If there is a case change, modify corresponding position in replaceable

View file

@ -4,8 +4,8 @@
* Corporation and others. All Rights Reserved.
**********************************************************************
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
* $Date: 2002/04/17 16:46:11 $
* $Revision: 1.21 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.22 $
**********************************************************************
*/
package com.ibm.icu.text;
@ -1334,13 +1334,13 @@ class TransliteratorParser {
p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
if (p >= 0) {
pragmaNormalizeRules(Normalizer.DECOMP);
pragmaNormalizeRules(Normalizer.NFD);
return p;
}
p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
if (p >= 0) {
pragmaNormalizeRules(Normalizer.COMPOSE);
pragmaNormalizeRules(Normalizer.NFC);
return p;
}

View file

@ -32,17 +32,17 @@ public class TransliteratorUtility {
// transliterators.
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
String ID = (String) e.nextElement();
showSourceSet(ID, Normalizer.NO_OP, false);
showSourceSet(ID, Normalizer.NONE, false);
}
} else {
// Usage: ID [NFKD | NFD] [lower]
Normalizer.Mode m = Normalizer.NO_OP;
Normalizer.Mode m = Normalizer.NONE;
boolean lowerFirst = false;
if (args.length >= 2) {
if (args[1].equalsIgnoreCase("NFD")) {
m = Normalizer.DECOMP;
m = Normalizer.NFD;
} else if (args[1].equalsIgnoreCase("NFKD")) {
m = Normalizer.DECOMP_COMPAT;
m = Normalizer.NFKD;
} else {
usage();
}
@ -87,7 +87,7 @@ public class TransliteratorUtility {
static void showSourceSetAux(Transliterator t, Normalizer.Mode m, boolean lowerFirst, boolean forward) throws IOException {
UnicodeSet sourceSet = t.getSourceSet();
if (m != Normalizer.NO_OP || lowerFirst) {
if (m != Normalizer.NONE || lowerFirst) {
UnicodeSetClosure.close(sourceSet, m, lowerFirst);
}
System.out.println(t.getID() + ": " +

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UTF16.java,v $
* $Date: 2002/05/14 23:45:46 $
* $Revision: 1.20 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.21 $
*
*******************************************************************************
*/
@ -14,6 +14,7 @@
package com.ibm.icu.text;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.NormalizerImpl;
/**
* Standalone utility class providing UTF16 character conversions and indexing
* conversions.
@ -2213,6 +2214,35 @@ public final class UTF16
return 0;
}
/**
 * Compares two strings by handing them to
 * NormalizerImpl.cmpEquivFold() with Normalizer.COMPARE_IGNORE_CASE added
 * to the supplied options.
 * A null argument sorts before any non-null argument.
 * @param a first string, may be null
 * @param b second string, may be null
 * @param options option bits passed through to
 *        NormalizerImpl.cmpEquivFold()
 * @return 0 if a and b are the same reference or cmpEquivFold() reports
 *         0; -1 if only a is null; 1 if only b is null; otherwise a
 *         non-zero value with the same sign as the cmpEquivFold() result
 * @exception ClassCastException if a non-null argument is not a String
 */
public int caseCompare(Object a, Object b, int options){
    if (a == b) {
        return 0;
    }
    if (a == null) {
        return -1;
    }
    if (b == null) {
        return 1;
    }
    // a and b are distinct, non-null references at this point, so the
    // contents always have to be compared
    int result = NormalizerImpl.cmpEquivFold((String) a, (String) b,
                             options|Normalizer.COMPARE_IGNORE_CASE);
    if (result == 0) {
        return 0;
    }
    // collapse the result to a small value that keeps its sign
    return (int)((byte)(result >> 24 | 1));
}
}
// private data members -------------------------------------------------

View file

@ -5,15 +5,15 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UppercaseTransliterator.java,v $
* $Date: 2002/04/02 23:59:59 $
* $Revision: 1.9 $
* $Date: 2002/06/20 01:21:18 $
* $Revision: 1.10 $
*
*****************************************************************************************
*/
package com.ibm.icu.text;
import java.util.*;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.UCharacterIterator;
import com.ibm.icu.impl.UnicodeCharacterIterator;
/**
* A transliterator that performs locale-sensitive toUpper()
@ -59,7 +59,7 @@ class UppercaseTransliterator extends Transliterator {
// get string for context
// TODO: add convenience method to do this, since we do it all over
UCharacterIterator original = new UCharacterIterator(text);
UnicodeCharacterIterator original = new UnicodeCharacterIterator(text);
// Walk through original string
// If there is a case change, modify corresponding position in replaceable