mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 14:31:31 +00:00
ICU-1763 First cut of Normalizer code port from ICU4C
X-SVN-Rev: 8907
This commit is contained in:
parent
7c37ae1353
commit
c445874382
40 changed files with 25418 additions and 17631 deletions
1
.gitattributes
vendored
1
.gitattributes
vendored
|
@ -72,7 +72,6 @@ icu4j/src/com/ibm/icu/dev/data/ThaiWordFreq.xls -text
|
|||
icu4j/src/com/ibm/icu/dev/data/holidays_jp.ucs -text
|
||||
icu4j/src/com/ibm/icu/dev/data/rbbi/english.dict -text
|
||||
icu4j/src/com/ibm/icu/dev/data/thai6.ucs -text
|
||||
icu4j/src/com/ibm/icu/dev/data/unicode/Draft-TestSuite.txt -text
|
||||
icu4j/src/com/ibm/icu/impl/data/ICULocaleData.jar -text
|
||||
icu4j/src/com/ibm/icu/impl/data/thai_dict -text
|
||||
icu4j/src/com/ibm/icu/impl/data/ucadata.dat -text
|
||||
|
|
File diff suppressed because it is too large
Load diff
17035
icu4j/src/com/ibm/icu/dev/data/unicode/NormalizationTest.txt
Normal file
17035
icu4j/src/com/ibm/icu/dev/data/unicode/NormalizationTest.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/Attic/UCharacterIteratorTest.java,v $
|
||||
* $Date: 2002/04/03 00:00:00 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2002/06/20 01:16:00 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -15,7 +15,7 @@ package com.ibm.icu.dev.test.lang;
|
|||
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.UCharacterIterator;
|
||||
import com.ibm.icu.impl.UnicodeCharacterIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
|
@ -41,10 +41,10 @@ public final class UCharacterIteratorTest extends TestFmwk
|
|||
*/
|
||||
public void TestClone()
|
||||
{
|
||||
UCharacterIterator iterator = new UCharacterIterator("testing");
|
||||
UCharacterIterator cloned = (UCharacterIterator)iterator.clone();
|
||||
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator("testing");
|
||||
UnicodeCharacterIterator cloned = (UnicodeCharacterIterator)iterator.clone();
|
||||
char completed = 0;
|
||||
while (completed != UCharacterIterator.DONE) {
|
||||
while (completed != UnicodeCharacterIterator.DONE) {
|
||||
completed = iterator.next();
|
||||
if (completed != cloned.next()) {
|
||||
errln("Cloned operation failed");
|
||||
|
@ -57,9 +57,9 @@ public final class UCharacterIteratorTest extends TestFmwk
|
|||
*/
|
||||
public void TestIteration()
|
||||
{
|
||||
UCharacterIterator iterator = new UCharacterIterator(
|
||||
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator(
|
||||
ITERATION_STRING_);
|
||||
UCharacterIterator iterator2 = new UCharacterIterator(
|
||||
UnicodeCharacterIterator iterator2 = new UnicodeCharacterIterator(
|
||||
ITERATION_STRING_);
|
||||
if (iterator.first() != ITERATION_STRING_.charAt(0)) {
|
||||
errln("Iterator failed retrieving first character");
|
||||
|
@ -75,12 +75,12 @@ public final class UCharacterIteratorTest extends TestFmwk
|
|||
iterator2.setIndex(0);
|
||||
iterator.setIndex(0);
|
||||
int ch = 0;
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
int index = iterator2.getIndex();
|
||||
ch = iterator2.nextCodePoint();
|
||||
if (index != ITERATION_SUPPLEMENTARY_INDEX) {
|
||||
if (ch != (int)iterator.next() &&
|
||||
ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
errln("Error mismatch in next() and nextCodePoint()");
|
||||
}
|
||||
}
|
||||
|
@ -94,12 +94,12 @@ public final class UCharacterIteratorTest extends TestFmwk
|
|||
}
|
||||
iterator.setIndex(ITERATION_STRING_.length());
|
||||
iterator2.setIndex(ITERATION_STRING_.length());
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
int index = iterator2.getIndex();
|
||||
ch = iterator2.previousCodePoint();
|
||||
if (index != ITERATION_SUPPLEMENTARY_INDEX) {
|
||||
if (ch != (int)iterator.previous() &&
|
||||
ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
errln("Error mismatch in previous() and " +
|
||||
"previousCodePoint()");
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,9 +1,15 @@
|
|||
/*
|
||||
************************************************************************
|
||||
* Copyright (c) 1997-2000, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
************************************************************************
|
||||
*/
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2000, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/ConformanceTest.java,v $
|
||||
* $Date: 2002/06/20 01:16:24 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.dev.test.normalizer;
|
||||
|
||||
|
@ -13,6 +19,7 @@ import com.ibm.icu.dev.test.*;
|
|||
import com.ibm.icu.lang.*;
|
||||
import com.ibm.icu.text.*;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
|
||||
public class ConformanceTest extends TestFmwk {
|
||||
|
||||
|
@ -21,28 +28,28 @@ public class ConformanceTest extends TestFmwk {
|
|||
public static void main(String[] args) throws Exception {
|
||||
new ConformanceTest().run(args);
|
||||
}
|
||||
|
||||
|
||||
public ConformanceTest() {
|
||||
// Doesn't matter what the string and mode are; we'll change
|
||||
// them later as needed.
|
||||
normalizer = new Normalizer("", Normalizer.COMPOSE);
|
||||
normalizer = new Normalizer("", Normalizer.NFC);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the conformance of Normalizer to
|
||||
* Test the conformance of NewNormalizer to
|
||||
* http://www.unicode.org/unicode/reports/tr15/conformance/Draft-TestSuite.txt.
|
||||
* This file must be located at the path specified as TEST_SUITE_FILE.
|
||||
*/
|
||||
public void TestConformance() {
|
||||
BufferedReader input = null;
|
||||
public void TestConformance() throws Exception{
|
||||
BufferedReader input = null;
|
||||
String line = null;
|
||||
String[] fields = new String[5];
|
||||
StringBuffer buf = new StringBuffer();
|
||||
int passCount = 0;
|
||||
int failCount = 0;
|
||||
|
||||
InputStream is = null;
|
||||
try {
|
||||
input = TestUtil.getDataReader("unicode/Draft-TestSuite.txt");
|
||||
input = TestUtil.getDataReader("unicode/NormalizationTest.txt");
|
||||
for (int count = 0;;++count) {
|
||||
line = input.readLine();
|
||||
if (line == null) break;
|
||||
|
@ -52,7 +59,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
// 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
|
||||
|
||||
// Skip comments
|
||||
if (line.charAt(0) == '#') continue;
|
||||
if (line.charAt(0) == '#' || line.charAt(0)=='@') continue;
|
||||
|
||||
// Parse out the fields
|
||||
hexsplit(line, ';', fields, buf);
|
||||
|
@ -101,46 +108,139 @@ public class ConformanceTest extends TestFmwk {
|
|||
* @param line the source line from the test suite file
|
||||
* @return true if the test passes
|
||||
*/
|
||||
private boolean checkConformance(String[] field, String line) {
|
||||
private boolean checkConformance(String[] field, String line) throws Exception{
|
||||
boolean pass = true;
|
||||
StringBuffer buf = new StringBuffer(); // scratch
|
||||
String out;
|
||||
|
||||
for (int i=0; i<5; ++i) {
|
||||
String out,fcd;
|
||||
int i=0;
|
||||
UTF16.StringComparator comp = new UTF16.StringComparator();
|
||||
for (i=0; i<5; ++i) {
|
||||
if (i<3) {
|
||||
out = Normalizer.normalize(field[i], Normalizer.COMPOSE, 0);
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFC);
|
||||
pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.COMPOSE, buf, +1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFC, buf, +1);
|
||||
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.COMPOSE, buf, -1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFC, buf, -1);
|
||||
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = Normalizer.normalize(field[i], Normalizer.DECOMP, 0);
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, +1);
|
||||
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, -1);
|
||||
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFD);
|
||||
pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.DECOMP, buf, +1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFD, buf, +1);
|
||||
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.DECOMP, buf, -1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFD, buf, -1);
|
||||
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, +1);
|
||||
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, -1);
|
||||
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
}
|
||||
out = Normalizer.normalize(field[i], Normalizer.COMPOSE_COMPAT, 0);
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFKC);
|
||||
pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.COMPOSE_COMPAT, buf, +1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKC, buf, +1);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.COMPOSE_COMPAT, buf, -1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKC, buf, -1);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
out = Normalizer.normalize(field[i], Normalizer.DECOMP_COMPAT, 0);
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, +1);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, -1);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFKD);
|
||||
pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.DECOMP_COMPAT, buf, +1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKD, buf, +1);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
out = iterativeNorm(field[i], Normalizer.DECOMP_COMPAT, buf, -1);
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKD, buf, -1);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, +1);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, -1);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
}
|
||||
// test quick checks
|
||||
if(Normalizer.NO == Normalizer.quickCheck(field[1], Normalizer.NFC)) {
|
||||
errln("Normalizer error: quickCheck(NFC(s), NewNormalizer.NFC) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
if(Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.NFD)) {
|
||||
errln("Normalizer error: quickCheck(NFD(s), NewNormalizer.NFD) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
if(Normalizer.NO == Normalizer.quickCheck(field[3], Normalizer.NFKC)) {
|
||||
errln("Normalizer error: quickCheck(NFKC(s), NewNormalizer.NFKC) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
if(Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.NFKD)) {
|
||||
errln("Normalizer error: quickCheck(NFKD(s), NewNormalizer.NFKD) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
if(!Normalizer.isNormalized(field[1], Normalizer.NFC)) {
|
||||
errln("Normalizer error: isNormalized(NFC(s), NewNormalizer.NFC) is false");
|
||||
pass = false;
|
||||
}
|
||||
if(!field[0].equals(field[1]) && Normalizer.isNormalized(field[0], Normalizer.NFC)) {
|
||||
errln("Normalizer error: isNormalized(s, NewNormalizer.NFC) is TRUE");
|
||||
pass = false;
|
||||
}
|
||||
if(!Normalizer.isNormalized(field[3], Normalizer.NFKC)) {
|
||||
errln("Normalizer error: isNormalized(NFKC(s), NewNormalizer.NFKC) is false");
|
||||
pass = false;
|
||||
}
|
||||
if(!field[0].equals(field[3]) && Normalizer.isNormalized(field[0], Normalizer.NFKC)) {
|
||||
errln("Normalizer error: isNormalized(s, NewNormalizer.NFKC) is TRUE");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
// test FCD quick check and "makeFCD"
|
||||
fcd=Normalizer.normalize(field[0], Normalizer.FCD);
|
||||
if(Normalizer.NO == Normalizer.quickCheck(fcd, Normalizer.FCD)) {
|
||||
errln("Normalizer error: quickCheck(FCD(s), NewNormalizer.FCD) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
if(Normalizer.NO == Normalizer.quickCheck(field[2], Normalizer.FCD)) {
|
||||
errln("Normalizer error: quickCheck(NFD(s), NewNormalizer.FCD) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
if(Normalizer.NO == Normalizer.quickCheck(field[4], Normalizer.FCD)) {
|
||||
errln("Normalizer error: quickCheck(NFKD(s), NewNormalizer.FCD) is NewNormalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
out=Normalizer.normalize(fcd, Normalizer.NFD);
|
||||
if(!out.equals(field[2])) {
|
||||
errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
|
||||
pass = false;
|
||||
}
|
||||
if (!pass) {
|
||||
errln("FAIL: " + line);
|
||||
}
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Do a normalization using the iterative API in the given direction.
|
||||
|
@ -148,20 +248,48 @@ public class ConformanceTest extends TestFmwk {
|
|||
* @param dir either +1 or -1
|
||||
*/
|
||||
private String iterativeNorm(String str, Normalizer.Mode mode,
|
||||
StringBuffer buf, int dir) {
|
||||
StringBuffer buf, int dir) throws Exception{
|
||||
normalizer.setText(str);
|
||||
normalizer.setMode(mode);
|
||||
buf.setLength(0);
|
||||
char ch;
|
||||
|
||||
int ch;
|
||||
if (dir > 0) {
|
||||
for (ch = normalizer.first(); ch != Normalizer.DONE;
|
||||
ch = normalizer.next()) {
|
||||
buf.append(ch);
|
||||
buf.append(UTF16.toString(ch));
|
||||
}
|
||||
} else {
|
||||
for (ch = normalizer.last(); ch != Normalizer.DONE;
|
||||
ch = normalizer.previous()) {
|
||||
buf.insert(0, ch);
|
||||
buf.insert(0, UTF16.toString(ch));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Do a normalization using the iterative API in the given direction.
|
||||
* @param str a Java StringCharacterIterator
|
||||
* @param buf scratch buffer
|
||||
* @param dir either +1 or -1
|
||||
*/
|
||||
private String iterativeNorm(StringCharacterIterator str, Normalizer.Mode mode,
|
||||
StringBuffer buf, int dir) throws Exception{
|
||||
normalizer.setText(str);
|
||||
normalizer.setMode(mode);
|
||||
buf.setLength(0);
|
||||
|
||||
int ch;
|
||||
if (dir > 0) {
|
||||
for (ch = normalizer.first(); ch != Normalizer.DONE;
|
||||
ch = normalizer.next()) {
|
||||
buf.append(UTF16.toString(ch));
|
||||
}
|
||||
} else {
|
||||
for (ch = normalizer.last(); ch != Normalizer.DONE;
|
||||
ch = normalizer.previous()) {
|
||||
buf.insert(0, UTF16.toString(ch));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
|
@ -180,8 +308,8 @@ public class ConformanceTest extends TestFmwk {
|
|||
if (exp.equals(got)) {
|
||||
return true;
|
||||
}
|
||||
errln(Utility.escape(" " + msg + ") " + op + "(" + s + ")=" + got +
|
||||
", exp. " + exp));
|
||||
errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got) +
|
||||
", exp. " + hex(exp)));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -207,21 +335,26 @@ public class ConformanceTest extends TestFmwk {
|
|||
}
|
||||
// Our field is from pos..delim-1.
|
||||
buf.setLength(0);
|
||||
while (pos < delim) {
|
||||
if (s.charAt(pos) == ' ') {
|
||||
++pos;
|
||||
} else if (pos+4 > delim) {
|
||||
throw new IllegalArgumentException("Premature eol in " + s);
|
||||
} else {
|
||||
int hex = Integer.parseInt(s.substring(pos, pos+4), 16);
|
||||
if (hex < 0 || hex > 0xFFFF) {
|
||||
throw new IllegalArgumentException("Out of range hex " +
|
||||
hex + " in " + s);
|
||||
|
||||
String toHex = s.substring(pos,delim);
|
||||
pos = delim;
|
||||
int index = 0;
|
||||
int len = toHex.length();
|
||||
while(index< len){
|
||||
if(toHex.charAt(index)==' '){
|
||||
index++;
|
||||
}else{
|
||||
int spacePos = toHex.indexOf(' ', index);
|
||||
if(spacePos==-1){
|
||||
appendInt(buf,toHex.substring(index,len),s);
|
||||
spacePos = len;
|
||||
}else{
|
||||
appendInt(buf,toHex.substring(index, spacePos),s);
|
||||
}
|
||||
buf.append((char) hex);
|
||||
pos += 4;
|
||||
index = spacePos+1;
|
||||
}
|
||||
}
|
||||
|
||||
if (buf.length() < 1) {
|
||||
throw new IllegalArgumentException("Empty field " + i + " in " + s);
|
||||
}
|
||||
|
@ -229,17 +362,29 @@ public class ConformanceTest extends TestFmwk {
|
|||
++pos; // Skip over delim
|
||||
}
|
||||
}
|
||||
|
||||
public static void appendInt(StringBuffer buf, String strToHex, String s){
|
||||
int hex = Integer.parseInt(strToHex,16);
|
||||
if (hex < 0 ) {
|
||||
throw new IllegalArgumentException("Out of range hex " +
|
||||
hex + " in " + s);
|
||||
}else if (hex > 0xFFFF){
|
||||
buf.append((char)((hex>>10)+0xd7c0));
|
||||
buf.append((char)((hex&0x3ff)|0xdc00));
|
||||
}else{
|
||||
buf.append((char) hex);
|
||||
}
|
||||
}
|
||||
|
||||
// Specific tests for debugging. These are generally failures
|
||||
// taken from the conformance file, but culled out to make
|
||||
// debugging easier. These can be eliminated without affecting
|
||||
// coverage.
|
||||
|
||||
public void _hideTestCase6() {
|
||||
public void _hideTestCase6() throws Exception{
|
||||
_testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
|
||||
}
|
||||
|
||||
public void _testOneLine(String line) {
|
||||
public void _testOneLine(String line) throws Exception{
|
||||
String[] fields = new String[5];
|
||||
StringBuffer buf = new StringBuffer();
|
||||
// Parse out the fields
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/Attic/ExhaustiveTest.java,v $
|
||||
* $Date: 2002/03/01 18:48:01 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2002/06/20 01:16:24 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -15,46 +15,38 @@ package com.ibm.icu.dev.test.normalizer;
|
|||
import com.ibm.icu.dev.test.*;
|
||||
import com.ibm.icu.lang.*;
|
||||
import com.ibm.icu.text.*;
|
||||
import com.ibm.icu.dev.tool.normalizer.UInfo;
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
|
||||
public class ExhaustiveTest extends TestFmwk
|
||||
{
|
||||
private UInfo info;
|
||||
|
||||
|
||||
public static void main(String[] args) throws Exception
|
||||
{
|
||||
UInfo tempInfo = null;
|
||||
String[] tempArgs = new String[args.length];
|
||||
String[] tempArgs = new String[args.length];
|
||||
int count = 0;
|
||||
|
||||
// Allow the test to be pointed at a specific version of the Unicode database
|
||||
for (int i = 0; i < args.length; i++)
|
||||
{
|
||||
if (args[i].equals("-data")) {
|
||||
tempInfo = new UInfo(args[++i], args[++i]);
|
||||
} else {
|
||||
tempArgs[count++] = args[i];
|
||||
}
|
||||
}
|
||||
//for (int i = 0; i < args.length; i++)
|
||||
//{
|
||||
// if (args[i].equals("-data")) {
|
||||
// tempInfo = new UInfo(args[++i], args[++i]);
|
||||
// } else {
|
||||
// tempArgs[count++] = args[i];
|
||||
// }
|
||||
//}
|
||||
|
||||
args = new String[count];
|
||||
System.arraycopy(tempArgs, 0, args, 0, count);
|
||||
|
||||
|
||||
if (tempInfo == null) {
|
||||
tempInfo = new UInfo();
|
||||
}
|
||||
new ExhaustiveTest(tempInfo).run(args);
|
||||
|
||||
new ExhaustiveTest().run(args);
|
||||
}
|
||||
|
||||
public ExhaustiveTest() {
|
||||
this.info = new UInfo();
|
||||
}
|
||||
|
||||
public ExhaustiveTest(UInfo info) {
|
||||
this.info = info;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Run through all of the characters returned by a composed-char iterator
|
||||
|
@ -89,7 +81,7 @@ public class ExhaustiveTest extends TestFmwk
|
|||
// make sense
|
||||
String chString = new StringBuffer().append(ch).toString();
|
||||
String iterDecomp = iter.decomposition();
|
||||
String normDecomp = Normalizer.decompose(chString, compat, 0);
|
||||
String normDecomp = Normalizer.decompose(chString, compat);
|
||||
|
||||
if (iterDecomp.equals(chString)) {
|
||||
errln("ERROR: " + hex(ch) + " has identical decomp");
|
||||
|
@ -106,7 +98,7 @@ public class ExhaustiveTest extends TestFmwk
|
|||
{
|
||||
for (char x = ++start; x < limit; x++) {
|
||||
String xString = new StringBuffer().append(x).toString();
|
||||
String decomp = Normalizer.decompose(xString, compat, options);
|
||||
String decomp = Normalizer.decompose(xString, compat);
|
||||
if (!decomp.equals(xString)) {
|
||||
errln("ERROR: " + hex(x) + " has decomposition (" + hex(decomp) + ")"
|
||||
+ " but was not returned by iterator");
|
||||
|
@ -124,26 +116,31 @@ public class ExhaustiveTest extends TestFmwk
|
|||
char ch = iter.next();
|
||||
|
||||
String chStr = new StringBuffer().append(ch).toString();
|
||||
String decomp = Normalizer.decompose(chStr, compat, options);
|
||||
String comp = Normalizer.compose(decomp, compat, options);
|
||||
String decomp = Normalizer.decompose(chStr, compat);
|
||||
String comp = Normalizer.compose(decomp, compat);
|
||||
|
||||
short cClass = info.getCanonicalClass(decomp.charAt(0));
|
||||
int cClass = UCharacter.getCombiningClass(decomp.charAt(0));
|
||||
cClass = 0;
|
||||
|
||||
if (info.isExcludedComposition(ch)) {
|
||||
logln("Skipped excluded char " + hex(ch) + " (" + info.getName(ch,true) + ")" );
|
||||
if (NormalizerImpl.isFullCompositionExclusion(ch)) {
|
||||
logln("Skipped excluded char " + hex(ch) + " (" + UCharacter.getName(ch) + ")" );
|
||||
continue;
|
||||
}
|
||||
|
||||
// Avoid disparaged characters
|
||||
if (info.getDecomposition(ch).length() == 4) continue;
|
||||
if (getDecomposition(ch,compat).length() == 4) continue;
|
||||
|
||||
if (!comp.equals(chStr)) {
|
||||
errln("ERROR: Round trip invalid: " + hex(chStr) + " --> " + hex(decomp)
|
||||
+ " --> " + hex(comp));
|
||||
|
||||
errln(" char decomp is '" + info.getDecomposition(ch) + "'");
|
||||
errln(" char decomp is '" + getDecomposition(ch,compat) + "'");
|
||||
}
|
||||
}
|
||||
}
|
||||
private String getDecomposition(char ch, boolean compat){
|
||||
char[] dest = new char[10];
|
||||
int length = NormalizerImpl.getDecomposition(ch,compat,dest,0,dest.length);
|
||||
return new String(dest,0,length);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/normalizer/TestCanonicalIterator.java,v $
|
||||
* $Date: 2002/03/19 00:18:44 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2002/06/20 01:16:24 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -39,6 +39,7 @@ public class TestCanonicalIterator extends TestFmwk {
|
|||
{"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
|
||||
};
|
||||
|
||||
|
||||
public void TestExhaustive() {
|
||||
int counter = 0;
|
||||
int mixedCounter = 0;
|
||||
|
@ -63,8 +64,8 @@ public class TestCanonicalIterator extends TestFmwk {
|
|||
if ((++counter % 5000) == 0) logln("Testing " + Utility.hex(i,0));
|
||||
|
||||
String s = UTF16.valueOf(i) + "\u0345";
|
||||
String decomp = Normalizer.decompose(s, false, 0);
|
||||
String comp = Normalizer.compose(s, false, 0);
|
||||
String decomp = Normalizer.decompose(s, false);
|
||||
String comp = Normalizer.compose(s, false);
|
||||
// skip characters that don't have either decomp.
|
||||
// need quick test for this!
|
||||
if (s.equals(decomp) && s.equals(comp)) continue;
|
||||
|
@ -170,14 +171,17 @@ public class TestCanonicalIterator extends TestFmwk {
|
|||
}
|
||||
|
||||
public void TestBasic() {
|
||||
// check build
|
||||
UnicodeSet ss = CanonicalIterator.getSafeStart();
|
||||
logln("Safe Start: " + ss.toPattern(true));
|
||||
ss = CanonicalIterator.getStarts('a');
|
||||
expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
|
||||
new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
|
||||
+ "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
|
||||
);
|
||||
// This is not interesting anymore as the data is already built
|
||||
// beforehand
|
||||
|
||||
// check build
|
||||
// UnicodeSet ss = CanonicalIterator.getSafeStart();
|
||||
// logln("Safe Start: " + ss.toPattern(true));
|
||||
// ss = CanonicalIterator.getStarts('a');
|
||||
// expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
|
||||
// new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
|
||||
// + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
|
||||
// );
|
||||
|
||||
// check permute
|
||||
// NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
|
||||
|
|
|
@ -98,14 +98,14 @@ public class RoundTripTest extends TestFmwk {
|
|||
public void TestGreekUNGEGN() throws IOException, ParseException {
|
||||
new Test("Latin-Greek/UNGEGN")
|
||||
.test("[a-zA-Z]", "[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]",
|
||||
"[\u00B5\u037A\u03D0-\uFFFF]", /* roundtrip exclusions */
|
||||
"[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
|
||||
this, new LegalGreek(false));
|
||||
}
|
||||
|
||||
public void Testel() throws IOException, ParseException {
|
||||
new Test("Latin-el")
|
||||
.test("[a-zA-Z]", "[\u003B\u00B7[:Greek:]-[\u03D7-\u03EF]]",
|
||||
"[\u00B5\u037A\u03D0-\uFFFF]", /* roundtrip exclusions */
|
||||
"[\u00B5\u037A\u03D0-\uFFFF{\u039C\u03C0}]", /* roundtrip exclusions */
|
||||
this, new LegalGreek(false));
|
||||
}
|
||||
|
||||
|
@ -136,7 +136,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
String nukta = "\u093c\u09bc\u0a3c\u0abc\u0b3c";
|
||||
String virama = "\u094d\u09cd\u0a4d\u0acd\u0b4d\u0bcd\u0c4d\u0ccd\u0d4d";
|
||||
String sanskritStressSigns = "\u0951\u0952\u0953\u0954";
|
||||
String chandrabindu = "\u0901\u0981\u0A81\u0b01";
|
||||
String chandrabindu = "\u0901\u0981\u0A81\u0b01\u0c01";
|
||||
public boolean is(String sourceString){
|
||||
int cp=sourceString.charAt(0);
|
||||
|
||||
|
@ -221,7 +221,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "Tamil-DEVANAGARI",
|
||||
"[:tamil:]", "[:Devanagari:]",
|
||||
"[\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]]", /*roundtrip exclusions*/
|
||||
"[\u0901\u093c\u0943-\u094a\u0951-\u0954\u0962\u0963\u090B\u090C\u090D\u0911\u0916\u0917\u0918\u091B\u091D\u0920\u0921\u0922\u0925\u0926\u0927\u092B\u092C\u092D\u0936\u093d\u0950[\u0958-\u0961]]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "DEVANAGARI-Tamil",
|
||||
"[:Devanagari:]", "[:tamil:]",
|
||||
|
@ -239,7 +239,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "KANNADA-DEVANAGARI",
|
||||
"[:KANNADA:]", "[:Devanagari:]",
|
||||
"[\u0946\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
|
||||
"[\u0901\u0946\u093c\u0950\u0945\u0949\u0951-\u0954\u0962\u0963\u0950\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "DEVANAGARI-KANNADA",
|
||||
"[:Devanagari:]", "[:KANNADA:]",
|
||||
|
@ -248,7 +248,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "MALAYALAM-DEVANAGARI",
|
||||
"[:MALAYALAM:]", "[:Devanagari:]",
|
||||
"[\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
|
||||
"[\u0901\u094a\u094b\u094c\u093c\u0950\u0944\u0945\u0949\u0951-\u0954\u0962\u0963\u090D\u0911\u093d\u0929\u0934[\u0958-\u095f]]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "DEVANAGARI-MALAYALAM",
|
||||
"[:Devanagari:]", "[:MALAYALAM:]",
|
||||
|
@ -284,7 +284,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "Tamil-BENGALI",
|
||||
"[:tamil:]", "[:BENGALI:]",
|
||||
"[\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1]", /*roundtrip exclusions*/
|
||||
"[\u0981\u09bc\u09c3\u09c4\u09e2\u09e3\u09f0\u09f1\u098B\u098C\u0996\u0997\u0998\u099B\u099D\u09A0\u09A1\u09A2\u09A5\u09A6\u09A7\u09AB\u09AC\u09AD\u09B6\u09DC\u09DD\u09DF\u09E0\u09E1]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "BENGALI-Tamil",
|
||||
"[:BENGALI:]", "[:tamil:]",
|
||||
|
@ -302,7 +302,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "KANNADA-BENGALI",
|
||||
"[:KANNADA:]", "[:BENGALI:]",
|
||||
"[\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
|
||||
"[\u0981\u09e2\u09e3\u09bc\u09d7\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "BENGALI-KANNADA",
|
||||
"[:BENGALI:]", "[:KANNADA:]",
|
||||
|
@ -311,7 +311,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "MALAYALAM-BENGALI",
|
||||
"[:MALAYALAM:]", "[:BENGALI:]",
|
||||
"[\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
|
||||
"[\u0981\u09e2\u09e3\u09bc\u09c4\u09f0\u09f1\u09dc\u09dd\u09df]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "BENGALI-MALAYALAM",
|
||||
"[:BENGALI:]", "[:MALAYALAM:]",
|
||||
|
@ -382,7 +382,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "TAMIL-GUJARATI",
|
||||
"[:TAMIL:]", "[:GUJARATI:]",
|
||||
"[\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0]", /*roundtrip exclusions*/
|
||||
"[\u0A81\u0abc\u0ac3\u0Ac4\u0Ac5\u0Ac9\u0Ac7\u0A8B\u0A8D\u0A91\u0A96\u0A97\u0A98\u0A9B\u0A9D\u0AA0\u0AA1\u0AA2\u0AA5\u0AA6\u0AA7\u0AAB\u0AAC\u0AAD\u0AB6\u0ABD\u0AD0\u0AE0]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "GUJARATI-TAMIL",
|
||||
"[:GUJARATI:]", "[:TAMIL:]",
|
||||
|
@ -400,7 +400,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "KANNADA-GUJARATI",
|
||||
"[:KANNADA:]", "[:GUJARATI:]",
|
||||
"[\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
|
||||
"[\u0A81\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "GUJARATI-KANNADA",
|
||||
"[:GUJARATI:]", "[:KANNADA:]",
|
||||
|
@ -409,7 +409,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "MALAYALAM-GUJARATI",
|
||||
"[:MALAYALAM:]", "[:GUJARATI:]",
|
||||
"[\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
|
||||
"[\u0A81\u0ac4\u0acb\u0acc\u0abc\u0Ac5\u0Ac9\u0A8D\u0A91\u0ABD\u0Ad0]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "GUJARATI-MALAYALAM",
|
||||
"[:GUJARATI:]", "[:MALAYALAM:]",
|
||||
|
@ -418,7 +418,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "TAMIL-ORIYA",
|
||||
"[:TAMIL:]", "[:ORIYA:]",
|
||||
"[\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61]", /*roundtrip exclusions*/
|
||||
"[\u0B01\u0b3c\u0b43\u0b56\u0B0B\u0B0C\u0B16\u0B17\u0B18\u0B1B\u0B1D\u0B20\u0B21\u0B22\u0B25\u0B26\u0B27\u0B2B\u0B2C\u0B2D\u0B36\u0B3D\u0B5C\u0B5D\u0B5F\u0B60\u0B61]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "ORIYA-TAMIL",
|
||||
"[:ORIYA:]", "[:TAMIL:]",
|
||||
|
@ -436,7 +436,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "KANNADA-ORIYA",
|
||||
"[:KANNADA:]", "[:ORIYA:]",
|
||||
"[\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
|
||||
"[\u0B01\u0b3c\u0b57\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "ORIYA-KANNADA",
|
||||
"[:ORIYA:]", "[:KANNADA:]",
|
||||
|
@ -445,7 +445,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "MALAYALAM-ORIYA",
|
||||
"[:MALAYALAM:]", "[:ORIYA:]",
|
||||
"[\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
|
||||
"[\u0B01\u0b3c\u0b56\u0B3D\u0B5C\u0B5D\u0B5F]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "ORIYA-MALAYALAM",
|
||||
"[:ORIYA:]", "[:MALAYALAM:]",
|
||||
|
@ -458,7 +458,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
},
|
||||
new String [] { "TAMIL-TELUGU",
|
||||
"[:TAMIL:]", "[:TELUGU:]",
|
||||
"[\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/
|
||||
"[\u0C01\u0c43\u0c44\u0c46\u0c47\u0c55\u0c56\u0c66\u0C0B\u0C0C\u0C16\u0C17\u0C18\u0C1B\u0C1D\u0C20\u0C21\u0C22\u0C25\u0C26\u0C27\u0C2B\u0C2C\u0C2D\u0C36\u0C60\u0C61]", /*roundtrip exclusions*/
|
||||
},
|
||||
|
||||
new String [] { "KANNADA-TAMIL",
|
||||
|
@ -481,7 +481,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "KANNADA-TELUGU",
|
||||
"[:KANNADA:]", "[:TELUGU:]",
|
||||
"[\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/
|
||||
"[\u0C01\u0c3f\u0c46\u0c48\u0c4a]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "TELUGU-KANNADA",
|
||||
"[:TELUGU:]", "[:KANNADA:]",
|
||||
|
@ -490,7 +490,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
new String [] { "MALAYALAM-TELUGU",
|
||||
"[:MALAYALAM:]", "[:TELUGU:]",
|
||||
"[\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/
|
||||
"[\u0C01\u0c44\u0c4a\u0c4c\u0c4b\u0c55\u0c56]", /*roundtrip exclusions*/
|
||||
},
|
||||
new String [] { "TELUGU-MALAYALAM",
|
||||
"[:TELUGU:]", "[:MALAYALAM:]",
|
||||
|
@ -566,7 +566,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
public boolean is(String sourceString) {
|
||||
try {
|
||||
int t;
|
||||
String decomp = Normalizer.normalize(sourceString, Normalizer.DECOMP, 0);
|
||||
String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);
|
||||
for (int i = 0; i < decomp.length(); ++i) { // don't worry about surrogates
|
||||
switch (getType(decomp.charAt(i))) {
|
||||
case 0:
|
||||
|
@ -619,11 +619,11 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
public boolean is(String sourceString) {
|
||||
try {
|
||||
String decomp = Normalizer.normalize(sourceString, Normalizer.DECOMP, 0);
|
||||
String decomp = Normalizer.normalize(sourceString, Normalizer.NFD);
|
||||
|
||||
// modern is simpler: don't care about anything but a grave
|
||||
if (!full) {
|
||||
if (sourceString.equals("\u039C\u03C0")) return false;
|
||||
//if (sourceString.equals("\u039C\u03C0")) return false;
|
||||
for (int i = 0; i < decomp.length(); ++i) {
|
||||
char c = decomp.charAt(i);
|
||||
// exclude all the accents
|
||||
|
@ -714,8 +714,8 @@ public class RoundTripTest extends TestFmwk {
|
|||
public static boolean isSame(String a, String b) {
|
||||
if (a.equals(b)) return true;
|
||||
if (a.equalsIgnoreCase(b) && isCamel(a)) return true;
|
||||
a = Normalizer.normalize(a, Normalizer.DECOMP, 0);
|
||||
b = Normalizer.normalize(b, Normalizer.DECOMP, 0);
|
||||
a = Normalizer.normalize(a, Normalizer.NFD);
|
||||
b = Normalizer.normalize(b, Normalizer.NFD);
|
||||
if (a.equals(b)) return true;
|
||||
if (a.equalsIgnoreCase(b) && isCamel(a)) return true;
|
||||
return false;
|
||||
|
@ -925,7 +925,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
String targ = sourceToTarget.transliterate(cs);
|
||||
if (!toTarget.containsAll(targ)
|
||||
|| badCharacters.containsSome(targ)) {
|
||||
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
|
||||
String targD = Normalizer.normalize(targ, Normalizer.NFD);
|
||||
if (!toTarget.containsAll(targD)
|
||||
|| badCharacters.containsSome(targD)) {
|
||||
logWrongScript("Source-Target", cs, targ);
|
||||
|
@ -934,7 +934,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
String cs2 = Normalizer.normalize(cs, Normalizer.DECOMP, 0);
|
||||
String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
|
||||
String targ2 = sourceToTarget.transliterate(cs2);
|
||||
if (!targ.equals(targ2)) {
|
||||
logNotCanonical("Source-Target", cs, targ, cs2, targ2);
|
||||
|
@ -978,14 +978,14 @@ public class RoundTripTest extends TestFmwk {
|
|||
String targ = sourceToTarget.transliterate(cs);
|
||||
if (!toTarget.containsAll(targ)
|
||||
|| badCharacters.containsSome(targ)) {
|
||||
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
|
||||
String targD = Normalizer.normalize(targ, Normalizer.NFD);
|
||||
if (!toTarget.containsAll(targD)
|
||||
|| badCharacters.containsSome(targD)) {
|
||||
logWrongScript("Source-Target", cs, targ);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
String cs2 = Normalizer.normalize(cs, Normalizer.DECOMP, 0);
|
||||
String cs2 = Normalizer.normalize(cs, Normalizer.NFD);
|
||||
String targ2 = sourceToTarget.transliterate(cs2);
|
||||
if (!targ.equals(targ2)) {
|
||||
logNotCanonical("Source-Target", cs, targ, cs2, targ2);
|
||||
|
@ -1005,28 +1005,36 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
usi.reset(targetRange);
|
||||
while (usi.next()) {
|
||||
int c = usi.codepoint;
|
||||
String cs;
|
||||
int c;
|
||||
if(usi.codepoint == usi.IS_STRING){
|
||||
cs = usi.string;
|
||||
c = UTF16.charAt(cs,0);
|
||||
}else{
|
||||
c = usi.codepoint;
|
||||
cs =UTF16.valueOf(c);
|
||||
}
|
||||
|
||||
String cs = UTF16.valueOf(c);
|
||||
String targ = targetToSource.transliterate(cs);
|
||||
String reverse = sourceToTarget.transliterate(targ);
|
||||
|
||||
if (!toSource.containsAll(targ)
|
||||
|| badCharacters.containsSome(targ)) {
|
||||
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
|
||||
String targD = Normalizer.normalize(targ, Normalizer.NFD);
|
||||
if (!toSource.containsAll(targD)
|
||||
|| badCharacters.containsSome(targD)) {
|
||||
logWrongScript("Target-Source", cs, targ);
|
||||
failTargSource.add(c);
|
||||
failTargSource.add(cs);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)) {
|
||||
if (!isSame(cs, reverse) && !roundtripExclusions.contains(c)
|
||||
&& !roundtripExclusions.contains(cs)) {
|
||||
logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);
|
||||
failRound.add(c);
|
||||
continue;
|
||||
}
|
||||
String targ2 = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
|
||||
String targ2 = Normalizer.normalize(targ, Normalizer.NFD);
|
||||
String reverse2 = sourceToTarget.transliterate(targ2);
|
||||
if (!reverse.equals(reverse2)) {
|
||||
logNotCanonical("Target-Source", targ, reverse, targ2, reverse2);
|
||||
|
@ -1076,7 +1084,7 @@ public class RoundTripTest extends TestFmwk {
|
|||
|
||||
if (!toSource.containsAll(targ) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
|
||||
|| badCharacters.containsSome(targ)) {
|
||||
String targD = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
|
||||
String targD = Normalizer.normalize(targ, Normalizer.NFD);
|
||||
if (!toSource.containsAll(targD) /*&& !failTargSource.contains(c) && !failTargSource.contains(d)*/
|
||||
|| badCharacters.containsSome(targD)) {
|
||||
logWrongScript("Target-Source", cs, targ);
|
||||
|
@ -1084,11 +1092,13 @@ public class RoundTripTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
if (!isSame(cs, reverse) /*&& !failRound.contains(c) && !failRound.contains(d)*/
|
||||
&& !roundtripExclusions.contains(c) && !roundtripExclusions.contains(d)) {
|
||||
&& !roundtripExclusions.contains(c)
|
||||
&& !roundtripExclusions.contains(d)
|
||||
&& !roundtripExclusions.contains(cs)) {
|
||||
logRoundTripFailure(cs,targetToSource.getID(), targ,sourceToTarget.getID(), reverse);
|
||||
continue;
|
||||
}
|
||||
String targ2 = Normalizer.normalize(targ, Normalizer.DECOMP, 0);
|
||||
String targ2 = Normalizer.normalize(targ, Normalizer.NFD);
|
||||
String reverse2 = sourceToTarget.transliterate(targ2);
|
||||
if (!reverse.equals(reverse2)) {
|
||||
logNotCanonical("Target-Source", targ, reverse, targ2, reverse2);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2002/06/12 17:37:10 $
|
||||
* $Revision: 1.106 $
|
||||
* $Date: 2002/06/20 01:16:48 $
|
||||
* $Revision: 1.107 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -2313,10 +2313,10 @@ public class TransliteratorTest extends TestFmwk {
|
|||
// Automatic generation of targets, to make it simpler to add test cases (and more fail-safe)
|
||||
|
||||
if (testCases[i].length > 2) target = testCases[i][2];
|
||||
else if (id.equalsIgnoreCase("NFD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.DECOMP,0);
|
||||
else if (id.equalsIgnoreCase("NFC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.COMPOSE,0);
|
||||
else if (id.equalsIgnoreCase("NFKD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.DECOMP_COMPAT,0);
|
||||
else if (id.equalsIgnoreCase("NFKC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.COMPOSE_COMPAT,0);
|
||||
else if (id.equalsIgnoreCase("NFD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFD);
|
||||
else if (id.equalsIgnoreCase("NFC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFC);
|
||||
else if (id.equalsIgnoreCase("NFKD")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFKD);
|
||||
else if (id.equalsIgnoreCase("NFKC")) target = com.ibm.icu.text.Normalizer.normalize(source, com.ibm.icu.text.Normalizer.NFKC);
|
||||
else if (id.equalsIgnoreCase("Lower")) target = UCharacter.toLowerCase(Locale.US, source);
|
||||
else if (id.equalsIgnoreCase("Upper")) target = UCharacter.toUpperCase(Locale.US, source);
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/WriteCharts.java,v $
|
||||
* $Date: 2002/03/13 19:52:34 $
|
||||
* $Revision: 1.14 $
|
||||
* $Date: 2002/06/20 01:16:48 $
|
||||
* $Revision: 1.15 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -198,7 +198,7 @@ public class WriteCharts {
|
|||
group |= 16;
|
||||
}
|
||||
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.DECOMP_COMPAT, 0))
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ss, Normalizer.NFKD))
|
||||
+ "\u0000" + ss,
|
||||
"<td class='s'>" + ss + "<br><tt>" + hex(ss)
|
||||
+ "</tt></td><td class='t'>" + ts + "<br><tt>" + hex(ts)
|
||||
|
@ -262,7 +262,7 @@ public class WriteCharts {
|
|||
group |= 16;
|
||||
}
|
||||
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.DECOMP_COMPAT, 0)) + ts,
|
||||
map.put(group + UCharacter.toLowerCase(Normalizer.normalize(ts, Normalizer.NFKD)) + ts,
|
||||
"<td class='s'>-</td><td class='t'>" + ts + "<br><tt>" + hex(ts)
|
||||
+ "</tt></td><td class='r'>"
|
||||
+ rt + "<br><tt>" + hex(rt) + "</tt></td>");
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/ConvertPOSIXLocale.java,v $
|
||||
* $Date: 2002/02/16 03:05:27 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/06/20 01:17:11 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -223,11 +223,11 @@ public class ConvertPOSIXLocale {
|
|||
process(args);
|
||||
//{{INIT_CONTROLS
|
||||
//}}
|
||||
}
|
||||
}
|
||||
|
||||
public void process(String args[]) {
|
||||
short options = identifyOptions(args);
|
||||
String enc="";
|
||||
String enc=null;
|
||||
if ((args.length < 2) || ((options & OPT_UNKNOWN) != 0)) {
|
||||
printUsage();
|
||||
} else {
|
||||
|
@ -249,6 +249,9 @@ public class ConvertPOSIXLocale {
|
|||
}
|
||||
|
||||
}
|
||||
if(enc==null){
|
||||
enc="Default";
|
||||
}
|
||||
if ((fileName == null) || (locale == null) || (options == 0)) {
|
||||
printUsage();
|
||||
} else {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/ICU2LocaleWriter.java,v $
|
||||
* $Date: 2002/02/16 03:05:28 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/06/20 01:17:12 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -58,17 +58,20 @@ public class ICU2LocaleWriter extends LocaleWriter {
|
|||
super.write(tag, o);
|
||||
} else {
|
||||
CollationItem[] items = (CollationItem[])o;
|
||||
print("CollationElements");
|
||||
println(" { ");
|
||||
for (int i = 0; i < items.length; i++) {
|
||||
if(items[i]!=null){
|
||||
printString(items[i].toString());
|
||||
if (items[i].comment != null) {
|
||||
tabTo(30);
|
||||
print("//");
|
||||
println(items[i].comment);
|
||||
if(items[0]!=null){
|
||||
print("Sequence");
|
||||
println(" { ");
|
||||
for (int i = 0; i < items.length; i++) {
|
||||
if(items[i]!=null){
|
||||
printString(items[i].toString());
|
||||
if (items[i].comment != null) {
|
||||
tabTo(30);
|
||||
print("//");
|
||||
println(items[i].comment);
|
||||
}
|
||||
}
|
||||
}
|
||||
println("}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/ICULocaleWriter.java,v $
|
||||
* $Date: 2002/02/16 03:05:28 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/06/20 01:17:12 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -38,17 +38,20 @@ public class ICULocaleWriter extends LocaleWriter {
|
|||
super.write(tag, o);
|
||||
} else {
|
||||
CollationItem[] items = (CollationItem[])o;
|
||||
print("CollationElements");
|
||||
println(" { ");
|
||||
for (int i = 0; i < items.length; i++) {
|
||||
if(items[i]!=null){
|
||||
printString(items[i].toString());
|
||||
if (items[i].comment != null) {
|
||||
tabTo(30);
|
||||
print("//");
|
||||
println(items[i].comment);
|
||||
}
|
||||
}
|
||||
if(items[0]!=null){
|
||||
print("Sequence");
|
||||
println(" { ");
|
||||
for (int i = 0; i < items.length; i++) {
|
||||
if(items[i]!=null){
|
||||
printString(items[i].toString());
|
||||
if (items[i].comment != null) {
|
||||
tabTo(30);
|
||||
print("//");
|
||||
println(items[i].comment);
|
||||
}
|
||||
}
|
||||
}
|
||||
println("}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/localeconverter/PosixCharMap.java,v $
|
||||
* $Date: 2002/02/16 03:05:30 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/06/20 01:17:12 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -62,6 +62,139 @@ public class PosixCharMap {
|
|||
encoding =enc;
|
||||
load(new BufferedReader(new FileReader(file)));
|
||||
}
|
||||
/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
|
||||
static private final char[] UNESCAPE_MAP = {
|
||||
/*" 0x22, 0x22 */
|
||||
/*' 0x27, 0x27 */
|
||||
/*? 0x3F, 0x3F */
|
||||
/*\ 0x5C, 0x5C */
|
||||
/*a*/ 0x61, 0x07,
|
||||
/*b*/ 0x62, 0x08,
|
||||
/*f*/ 0x66, 0x0c,
|
||||
/*n*/ 0x6E, 0x0a,
|
||||
/*r*/ 0x72, 0x0d,
|
||||
/*t*/ 0x74, 0x09,
|
||||
/*v*/ 0x76, 0x0b
|
||||
};
|
||||
/**
|
||||
* Convert an escape to a 32-bit code point value. We attempt
|
||||
* to parallel the icu4c unesacpeAt() function.
|
||||
* @param offset16 an array containing offset to the character
|
||||
* <em>after</em> the backslash. Upon return offset16[0] will
|
||||
* be updated to point after the escape sequence.
|
||||
* @return character value from 0 to 10FFFF, or -1 on error.
|
||||
*/
|
||||
public static int unescapeAt(String s, int[] offset16) {
|
||||
int c;
|
||||
int result = 0;
|
||||
int n = 0;
|
||||
int minDig = 0;
|
||||
int maxDig = 0;
|
||||
int bitsPerDigit = 4;
|
||||
int dig;
|
||||
int i;
|
||||
|
||||
/* Check that offset is in range */
|
||||
int offset = offset16[0];
|
||||
int length = s.length();
|
||||
if (offset < 0 || offset >= length) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Fetch first UChar after '\\' */
|
||||
c = UTF16.charAt(s, offset);
|
||||
offset += UTF16.getCharCount(c);
|
||||
|
||||
/* Convert hexadecimal and octal escapes */
|
||||
switch (c) {
|
||||
case 'u':
|
||||
minDig = maxDig = 4;
|
||||
break;
|
||||
case 'U':
|
||||
minDig = maxDig = 8;
|
||||
break;
|
||||
case 'x':
|
||||
minDig = 1;
|
||||
maxDig = 2;
|
||||
break;
|
||||
default:
|
||||
dig = UCharacter.digit(c, 8);
|
||||
if (dig >= 0) {
|
||||
minDig = 1;
|
||||
maxDig = 3;
|
||||
n = 1; /* Already have first octal digit */
|
||||
bitsPerDigit = 3;
|
||||
result = dig;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (minDig != 0) {
|
||||
while (offset < length && n < maxDig) {
|
||||
// TEMPORARY
|
||||
// TODO: Restore the char32-based code when UCharacter.digit
|
||||
// is working (Bug 66).
|
||||
|
||||
//c = UTF16.charAt(s, offset);
|
||||
//dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
c = s.charAt(offset);
|
||||
dig = Character.digit((char)c, (bitsPerDigit == 3) ? 8 : 16);
|
||||
if (dig < 0) {
|
||||
break;
|
||||
}
|
||||
result = (result << bitsPerDigit) | dig;
|
||||
//offset += UTF16.getCharCount(c);
|
||||
++offset;
|
||||
++n;
|
||||
}
|
||||
if (n < minDig) {
|
||||
return -1;
|
||||
}
|
||||
offset16[0] = offset;
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Convert C-style escapes in table */
|
||||
for (i=0; i<UNESCAPE_MAP.length; i+=2) {
|
||||
if (c == UNESCAPE_MAP[i]) {
|
||||
offset16[0] = offset;
|
||||
return UNESCAPE_MAP[i+1];
|
||||
} else if (c < UNESCAPE_MAP[i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* If no special forms are recognized, then consider
|
||||
* the backslash to generically escape the next character. */
|
||||
offset16[0] = offset;
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert all escapes in a given string using unescapeAt().
|
||||
* @exception IllegalArgumentException if an invalid escape is
|
||||
* seen.
|
||||
*/
|
||||
public static String unescape(String s) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
int[] pos = new int[1];
|
||||
for (int i=0; i<s.length(); ) {
|
||||
char c = s.charAt(i++);
|
||||
if (c == '\\') {
|
||||
pos[0] = i;
|
||||
int e = unescapeAt(s, pos);
|
||||
if (e < 0) {
|
||||
throw new IllegalArgumentException("Invalid escape sequence " +
|
||||
s.substring(i-1, Math.min(i+8, s.length())));
|
||||
}
|
||||
UTF16.append(buf, e);
|
||||
i = pos[0];
|
||||
} else {
|
||||
buf.append(c);
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
public void load(Reader inputReader) throws IOException {
|
||||
PosixCharMap oldMap = SymbolTransition.getCharMap();
|
||||
SymbolTransition.setCharMap(null);
|
||||
|
@ -104,14 +237,21 @@ public class PosixCharMap {
|
|||
state = p.nextToken();
|
||||
} while ((state != EOF) && !p.dataEquals("CHARMAP"));
|
||||
p.accept(EOL);
|
||||
if (state != EOF) {
|
||||
if (state != EOF ) {
|
||||
p = new Lex(states2, input);
|
||||
state = p.nextToken();
|
||||
while (state != EOF) {
|
||||
while (state != EOF ) {
|
||||
|
||||
String key = p.getData();
|
||||
if(p.dataEquals("ENDCHARMAP")){
|
||||
break;
|
||||
}
|
||||
state = p.nextToken();
|
||||
while (state == EOL) {
|
||||
String data = p.getData();
|
||||
if(p.dataEquals("ENDCHARMAP")){
|
||||
break;
|
||||
}
|
||||
String data = unescape(p.getData());
|
||||
data.trim();
|
||||
if (data.startsWith("<U") || data.startsWith("#U")) {
|
||||
String numData = data.substring(2,data.length()-1);
|
||||
|
@ -154,8 +294,7 @@ public class PosixCharMap {
|
|||
|
||||
state = p.nextToken();
|
||||
key=p.getData();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//state = p.nextToken();
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/translit/UnicodeSetClosure.java,v $
|
||||
* $Date: 2002/02/25 22:43:59 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2002/06/20 01:17:39 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -95,7 +95,7 @@ public class UnicodeSetClosure {
|
|||
}
|
||||
|
||||
static final Normalizer.Mode[] testModes = {
|
||||
Normalizer.NO_OP, Normalizer.DECOMP, Normalizer.COMPOSE, Normalizer.DECOMP_COMPAT, Normalizer.COMPOSE_COMPAT};
|
||||
Normalizer.NONE, Normalizer.NFD, Normalizer.NFC, Normalizer.NFKD, Normalizer.NFKC};
|
||||
static final String[] modeNames = {
|
||||
"NoNF", "NFD", "NFC", "NFKD", "NFKC"};
|
||||
|
||||
|
@ -197,7 +197,7 @@ public class UnicodeSetClosure {
|
|||
String source = UTF16.valueOf(cp);
|
||||
String result = source;
|
||||
if (lowerFirst) result = UCharacter.toLowerCase(Locale.US, result);
|
||||
result = Normalizer.normalize(result, mode, 0);
|
||||
result = Normalizer.normalize(result, mode);
|
||||
if (lowerFirst) result = UCharacter.toLowerCase(Locale.US, result);
|
||||
if (result.equals(source)) return null;
|
||||
return result;
|
||||
|
|
|
@ -31,13 +31,13 @@ import java.io.*;
|
|||
public class genIndexFilters {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
Normalizer.Mode m = Normalizer.NO_OP;
|
||||
Normalizer.Mode m = Normalizer.NONE;
|
||||
boolean lowerFirst = false;
|
||||
if (args.length >= 2) {
|
||||
if (args[1].equalsIgnoreCase("NFD")) {
|
||||
m = Normalizer.DECOMP;
|
||||
m = Normalizer.NFD;
|
||||
} else if (args[1].equalsIgnoreCase("NFKD")) {
|
||||
m = Normalizer.DECOMP_COMPAT;
|
||||
m = Normalizer.NFKD;
|
||||
} else {
|
||||
usage();
|
||||
}
|
||||
|
@ -59,7 +59,7 @@ public class genIndexFilters {
|
|||
Transliterator t = Transliterator.getInstance(ID);
|
||||
// TransliteratorUtility gives us access to package private API
|
||||
UnicodeSet sourceSet = TransliteratorUtility.getSourceSet(t);
|
||||
if (m != Normalizer.NO_OP || lowerFirst) {
|
||||
if (m != Normalizer.NONE || lowerFirst) {
|
||||
UnicodeSetClosure.close(sourceSet, m, lowerFirst);
|
||||
}
|
||||
System.out.println(sourceSet.toPattern(true));
|
||||
|
|
157
icu4j/src/com/ibm/icu/impl/ICUCharacterIterator.java
Normal file
157
icu4j/src/com/ibm/icu/impl/ICUCharacterIterator.java
Normal file
|
@ -0,0 +1,157 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2000, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/ICUCharacterIterator.java,v $
|
||||
* $Date: 2002/06/20 01:18:07 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
public class ICUCharacterIterator extends UCharacterIterator {
|
||||
|
||||
private CharacterIterator iterator;
|
||||
|
||||
/**
|
||||
* Current index
|
||||
*/
|
||||
private int currentIndex;
|
||||
|
||||
/**
|
||||
* length
|
||||
*/
|
||||
private int length;
|
||||
|
||||
/**
|
||||
* cache of begin offset in character iterator
|
||||
*/
|
||||
private int beginIndex;
|
||||
|
||||
public ICUCharacterIterator(CharacterIterator iter){
|
||||
if(iter==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
iterator = iter;
|
||||
currentIndex = 0;
|
||||
beginIndex = iter.getBeginIndex();
|
||||
length = iter.getEndIndex() - beginIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#current()
|
||||
*/
|
||||
public int current() {
|
||||
if (currentIndex < length) {
|
||||
return iterator.setIndex(beginIndex + currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getLength()
|
||||
*/
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getIndex()
|
||||
*/
|
||||
public int getIndex() {
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#next()
|
||||
*/
|
||||
public int next() {
|
||||
if(currentIndex < length){
|
||||
return iterator.setIndex(beginIndex + currentIndex++);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#previous()
|
||||
*/
|
||||
public int previous() {
|
||||
if(currentIndex>0){
|
||||
return iterator.setIndex(beginIndex + --currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#setIndex(int)
|
||||
*/
|
||||
public void setIndex(int index) {
|
||||
if (index < 0 || index > length) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
currentIndex = index;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#setToLimit()
|
||||
*/
|
||||
public void setToLimit() {
|
||||
currentIndex = length;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getText(char[])
|
||||
*/
|
||||
public int getText(char[] fillIn, int offset){
|
||||
if(offset < 0 || offset + length > fillIn.length){
|
||||
throw new IndexOutOfBoundsException(Integer.toString(length));
|
||||
}
|
||||
|
||||
for (char ch = iterator.first(); ch != iterator.DONE; ch = iterator.next()) {
|
||||
fillIn[offset++] = ch;
|
||||
}
|
||||
iterator.setIndex(beginIndex + currentIndex);
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a clone of this iterator. Clones the underlying character iterator.
|
||||
* @see UCharacterIterator#clone()
|
||||
*/
|
||||
public Object clone(){
|
||||
try {
|
||||
ICUCharacterIterator result = (ICUCharacterIterator) super.clone();
|
||||
result.iterator = (CharacterIterator)this.iterator.clone();
|
||||
return result;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // only invoked if bad underlying character iterator
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#moveIndex()
|
||||
*/
|
||||
public int moveIndex(int index){
|
||||
currentIndex += index;
|
||||
|
||||
if(currentIndex < 0) {
|
||||
currentIndex = 0;
|
||||
} else if(currentIndex > length) {
|
||||
currentIndex = length;
|
||||
}
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see UCharacterIterator#getCharacterIterator()
|
||||
*/
|
||||
public CharacterIterator getCharacterIterator(){
|
||||
return (CharacterIterator)iterator.clone();
|
||||
}
|
||||
}
|
|
@ -5,14 +5,15 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/NormalizerDataReader.java,v $
|
||||
* $Date: 2002/03/28 01:50:59 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/06/20 01:18:07 $
|
||||
* $Revision: 1.4 $
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
import java.io.*;
|
||||
import com.ibm.icu.impl.ICUDebug;
|
||||
import com.ibm.icu.impl.ICUDebug;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
/**
|
||||
* @version 1.0
|
||||
* @author Ram Viswanadha
|
||||
|
@ -288,8 +289,8 @@ final class NormalizerDataReader {
|
|||
throws IOException{
|
||||
if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
|
||||
|
||||
ICUBinary.readHeader(inputStream, DATA_FORMAT_ID_,
|
||||
DATA_FORMAT_VERSION_, UNICODE_VERSION_);
|
||||
ICUBinary.readHeader(inputStream, DATA_FORMAT_ID,
|
||||
DATA_FORMAT_VERSION, UNICODE_VERSION);
|
||||
|
||||
if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
|
||||
|
||||
|
@ -299,61 +300,63 @@ final class NormalizerDataReader {
|
|||
}
|
||||
|
||||
// protected methods -------------------------------------------------
|
||||
|
||||
|
||||
protected int[] readIndexes(int length)throws IOException{
|
||||
int[] indexes = new int[length];
|
||||
//Read the indexes
|
||||
for (int i = 0; i <length ; i++) {
|
||||
indexes[i] = dataInputStream.readInt();
|
||||
}
|
||||
return indexes;
|
||||
}
|
||||
/**
|
||||
* <p>Reads uprops.dat and parses it into blocks of data to be stored in
|
||||
* NormalizerImpl.</P>
|
||||
* @param impl NormalizerImpl instance
|
||||
* @param normBytes
|
||||
* @param fcdBytes
|
||||
* @param auxBytes
|
||||
* @param extraData
|
||||
* @param combiningTable
|
||||
* @param canonStartSets
|
||||
* @exception IOException thrown when data reading fails
|
||||
* @draft 2.1
|
||||
*/
|
||||
protected void read(NormalizerImpl impl)
|
||||
throws IOException{
|
||||
|
||||
//Read the indexes
|
||||
int[] indexes = new int[NormalizerImpl.INDEX_TOP];
|
||||
for (int i = 0; i <indexes.length ; i++) {
|
||||
indexes[i] = dataInputStream.readInt();
|
||||
}
|
||||
|
||||
|
||||
//Read the bytes that make up the normTrie
|
||||
byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
|
||||
protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
|
||||
char[] extraData, char[] combiningTable,
|
||||
Object[] canonStartSets)
|
||||
throws IOException{
|
||||
|
||||
//Read the bytes that make up the normTrie
|
||||
dataInputStream.read(normBytes);
|
||||
ByteArrayInputStream normTrieStream= new ByteArrayInputStream(normBytes);
|
||||
|
||||
//normTrieStream= new ByteArrayInputStream(normBytes);
|
||||
|
||||
//Read the extra data
|
||||
int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
|
||||
char[] extraData = new char[extraDataTop];
|
||||
for(int i=0;i<extraDataTop;i++){
|
||||
for(int i=0;i<extraData.length;i++){
|
||||
extraData[i]=dataInputStream.readChar();
|
||||
}
|
||||
|
||||
//Read the combining class table
|
||||
int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
|
||||
char[] combiningTable = new char[combiningTableTop];
|
||||
for(int i=0; i<combiningTableTop; i++){
|
||||
for(int i=0; i<combiningTable.length; i++){
|
||||
combiningTable[i]=dataInputStream.readChar();
|
||||
}
|
||||
|
||||
//Read the fcdTrie
|
||||
byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
|
||||
dataInputStream.read(fcdBytes);
|
||||
ByteArrayInputStream fcdTrieStream= new ByteArrayInputStream(fcdBytes);
|
||||
|
||||
|
||||
//Read the AuxTrie
|
||||
byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
|
||||
//Read the AuxTrie
|
||||
dataInputStream.read(auxBytes);
|
||||
ByteArrayInputStream auxTrieStream= new ByteArrayInputStream(auxBytes);
|
||||
|
||||
//Read the canonical start sets
|
||||
Object[] canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
|
||||
int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
|
||||
for(int i=0; i<canonStartSetsIndexes.length; i++){
|
||||
|
||||
for(int i=0; i<canonStartSetsIndexes.length; i++){
|
||||
canonStartSetsIndexes[i]=dataInputStream.readChar();
|
||||
}
|
||||
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]-NormalizerImpl.SET_INDEX_TOP];
|
||||
|
||||
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH]-NormalizerImpl.SET_INDEX_TOP];
|
||||
|
||||
for(int i=0; i<startSets.length; i++){
|
||||
startSets[i]=dataInputStream.readChar();
|
||||
}
|
||||
|
@ -369,20 +372,11 @@ final class NormalizerDataReader {
|
|||
canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
|
||||
canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX ] = bmpTable;
|
||||
canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
|
||||
|
||||
//Now set the tries
|
||||
impl.normTrieImpl.normTrie = new IntTrie( normTrieStream,impl.normTrieImpl );
|
||||
impl.fcdTrieImpl.fcdTrie = new CharTrie(fcdTrieStream,impl.fcdTrieImpl );
|
||||
impl.auxTrieImpl.auxTrie = new CharTrie( auxTrieStream, impl.auxTrieImpl );
|
||||
impl.indexes = indexes;
|
||||
impl.extraData = extraData;
|
||||
impl.combiningTable = combiningTable;
|
||||
impl.isDataLoaded = true;
|
||||
impl.canonStartSets = canonStartSets;
|
||||
impl.isFormatVersion_2_1 = DATA_FORMAT_VERSION_[0]>2 || (DATA_FORMAT_VERSION_[0]==2 && DATA_FORMAT_VERSION_[1]>=1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
public byte[] getDataFormatVersion(){
|
||||
return DATA_FORMAT_VERSION;
|
||||
}
|
||||
// private data members -------------------------------------------------
|
||||
|
||||
|
||||
|
@ -396,13 +390,13 @@ final class NormalizerDataReader {
|
|||
* No guarantees are made if a older version is used
|
||||
* see store.c of gennorm for more information and values
|
||||
*/
|
||||
private static final byte DATA_FORMAT_ID_[] = {(byte)0x4E, (byte)0x6F,
|
||||
private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
|
||||
(byte)0x72, (byte)0x6D};
|
||||
private static final byte DATA_FORMAT_VERSION_[] = {(byte)0x2, (byte)0x1,
|
||||
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x1,
|
||||
(byte)0x5, (byte)0x2};
|
||||
//TODO: Set the version info after the VersionInfo class is ported
|
||||
private static final byte UNICODE_VERSION_[] = {(byte)0x3, (byte)0x1,
|
||||
(byte)0x1, (byte)0x0};
|
||||
private static final String UNICODE_VERSION_STRING_ = "3.1.1.0";
|
||||
private static final byte UNICODE_VERSION[] = {(byte)0x3, (byte)0x2,
|
||||
(byte)0x0, (byte)0x0};
|
||||
private static final String UNICODE_VERSION_STRING = "3.2.0.0";
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
240
icu4j/src/com/ibm/icu/impl/ReplaceableCharacterIterator.java
Normal file
240
icu4j/src/com/ibm/icu/impl/ReplaceableCharacterIterator.java
Normal file
|
@ -0,0 +1,240 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2000, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/ReplaceableCharacterIterator.java,v $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.ReplaceableString;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
/**
|
||||
* DLF docs must define behavior when Replaceable is mutated underneath
|
||||
* the iterator.
|
||||
*
|
||||
* This and ICUCharacterIterator share some code, maybe they should share
|
||||
* an implementation, or the common state and implementation should be
|
||||
* moved up into UCharacterIterator.
|
||||
*
|
||||
* What are first, last, and getBeginIndex doing here?!?!?!
|
||||
*/
|
||||
public class ReplaceableCharacterIterator extends UCharacterIterator {
|
||||
|
||||
// public constructor ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
* @param replacable text which the iterator will be based on
|
||||
*/
|
||||
public ReplaceableCharacterIterator(Replaceable replaceable){
|
||||
if(replaceable==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = replaceable;
|
||||
this.currentIndex = 0;
|
||||
this.length = replaceable.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
* @param str text which the iterator will be based on
|
||||
*/
|
||||
public ReplaceableCharacterIterator(String str){
|
||||
if(str==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(str);
|
||||
this.currentIndex = 0;
|
||||
this.length = replaceable.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
* @param src an array of characters on which the iterator will be based
|
||||
*/
|
||||
public ReplaceableCharacterIterator(char[] src){
|
||||
if(src==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(new String(src));
|
||||
this.currentIndex = 0;
|
||||
this.length = replaceable.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
* @param buf buffer of text on which the iterator will be based
|
||||
*/
|
||||
public ReplaceableCharacterIterator(StringBuffer buf){
|
||||
if(buf==null){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
this.replaceable = new ReplaceableString(buf);
|
||||
this.currentIndex = 0;
|
||||
this.length = replaceable.length();
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, does not clone the underlying
|
||||
* <code>Replaceable</code>object
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone(){
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // never invoked
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current UTF16 character.
|
||||
* @return current UTF16 character
|
||||
*/
|
||||
public int current(){
|
||||
if (currentIndex < length) {
|
||||
return replaceable.charAt(currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current codepoint
|
||||
* @return current codepoint
|
||||
*/
|
||||
public int currentCodePoint(){
|
||||
// cannot use charAt due to it different
|
||||
// behaviour when index is pointing at a
|
||||
// trail surrogate, check for surrogates
|
||||
|
||||
int ch = current();
|
||||
if(UTF16.isLeadSurrogate((char)ch)){
|
||||
// advance the index to get the next code point
|
||||
next();
|
||||
// due to post increment semantics current() after next()
|
||||
// actually returns the next char which is what we want
|
||||
int ch2 = current();
|
||||
// current should never change the current index so back off
|
||||
previous();
|
||||
|
||||
if(UTF16.isTrailSurrogate((char)ch2)){
|
||||
// we found a surrogate pair
|
||||
return UCharacterProperty.getRawSupplementary(
|
||||
(char)ch,(char)ch2
|
||||
);
|
||||
}
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the start of the text.
|
||||
* @return 0
|
||||
*/
|
||||
public int getBeginIndex(){
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
* @return length of the text
|
||||
*/
|
||||
public int getLength(){
|
||||
return length;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current currentIndex in text.
|
||||
* @return current currentIndex in text.
|
||||
*/
|
||||
public int getIndex(){
|
||||
return currentIndex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next UTF16 character and increments the iterator's currentIndex by 1.
|
||||
* If the resulting currentIndex is greater or equal to the text length, the
|
||||
* currentIndex is reset to the text length and a value of DONECODEPOINT is
|
||||
* returned.
|
||||
* @return next UTF16 character in text or DONE if the new currentIndex is off the
|
||||
* end of the text range.
|
||||
*/
|
||||
public int next(){
|
||||
if (currentIndex < length) {
|
||||
return replaceable.charAt(currentIndex++);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns previous UTF16 character and decrements the iterator's currentIndex by
|
||||
* 1.
|
||||
* If the resulting currentIndex is less than 0, the currentIndex is reset to 0 and a
|
||||
* value of DONECODEPOINT is returned.
|
||||
* @return next UTF16 character in text or DONE if the new currentIndex is off the
|
||||
* start of the text range.
|
||||
*/
|
||||
public int previous(){
|
||||
if (currentIndex > 0) {
|
||||
return replaceable.charAt(--currentIndex);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Sets the currentIndex to the specified currentIndex in the text and returns that
|
||||
* single UTF16 character at currentIndex.
|
||||
* This assumes the text is stored as 16-bit code units.</p>
|
||||
* @param currentIndex the currentIndex within the text.
|
||||
* @exception IllegalArgumentException is thrown if an invalid currentIndex is
|
||||
* supplied. i.e. currentIndex is out of bounds.
|
||||
* @return the character at the specified currentIndex or DONE if the specified
|
||||
* currentIndex is equal to the end of the text.
|
||||
*/
|
||||
public void setIndex(int currentIndex) throws IndexOutOfBoundsException{
|
||||
if (currentIndex < 0 || currentIndex > length) {
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
this.currentIndex = currentIndex;
|
||||
}
|
||||
|
||||
public int getText(char[] fillIn, int offset){
|
||||
if(offset < 0 || offset + length > fillIn.length){
|
||||
throw new IndexOutOfBoundsException(Integer.toString(length));
|
||||
}
|
||||
replaceable.getChars(0,length,fillIn,offset);
|
||||
return length;
|
||||
}
|
||||
|
||||
public String getString(){
|
||||
char[] arr = new char[length];
|
||||
replaceable.getChars(0,length,arr,0);
|
||||
return new String(arr);
|
||||
}
|
||||
|
||||
// private data members ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Replacable object
|
||||
*/
|
||||
private Replaceable replaceable;
|
||||
/**
|
||||
* Current currentIndex
|
||||
*/
|
||||
private int currentIndex;
|
||||
/**
|
||||
* Replaceable text length
|
||||
*/
|
||||
private int length;
|
||||
}
|
91
icu4j/src/com/ibm/icu/impl/UCharArrayIterator.java
Normal file
91
icu4j/src/com/ibm/icu/impl/UCharArrayIterator.java
Normal file
|
@ -0,0 +1,91 @@
|
|||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2001, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/UCharArrayIterator.java,v $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
|
||||
/**
|
||||
* @author Doug Felt
|
||||
*
|
||||
*/
|
||||
|
||||
public final class UCharArrayIterator extends UCharacterIterator {
|
||||
private final char[] text;
|
||||
private final int start;
|
||||
private final int limit;
|
||||
private int pos;
|
||||
|
||||
public UCharArrayIterator(char[] text, int start, int limit) {
|
||||
if (start < 0 || limit > text.length || start > limit) {
|
||||
throw new IllegalArgumentException("start: " + start + " or limit: "
|
||||
+ limit + " out of range [0, "
|
||||
+ text.length + ")");
|
||||
}
|
||||
this.text = text;
|
||||
this.start = start;
|
||||
this.limit = limit;
|
||||
|
||||
this.pos = start;
|
||||
}
|
||||
|
||||
public int current() {
|
||||
return pos < limit ? text[pos] : DONE;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return limit - start;
|
||||
}
|
||||
|
||||
public int getIndex() {
|
||||
return pos - start;
|
||||
}
|
||||
|
||||
public int next() {
|
||||
return pos < limit ? text[pos++] : DONE;
|
||||
}
|
||||
|
||||
public int previous() {
|
||||
return pos > start ? text[--pos] : DONE;
|
||||
}
|
||||
|
||||
public void setIndex(int index) {
|
||||
if (index < 0 || index > limit - start) {
|
||||
throw new IndexOutOfBoundsException("index: " + index +
|
||||
" out of range [0, "
|
||||
+ (limit - start) + ")");
|
||||
}
|
||||
pos = start + index;
|
||||
}
|
||||
|
||||
public int getText(char[] fillIn, int offset) {
|
||||
int len = limit - start;
|
||||
System.arraycopy(text, start, fillIn, offset, len);
|
||||
return len;
|
||||
}
|
||||
|
||||
public String getString() {
|
||||
return new String(text, start, limit - start);
|
||||
}
|
||||
/**
|
||||
* Creates a copy of this iterator, does not clone the underlying
|
||||
* <code>Replaceable</code>object
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone(){
|
||||
try {
|
||||
return super.clone();
|
||||
} catch (CloneNotSupportedException e) {
|
||||
return null; // never invoked
|
||||
}
|
||||
}
|
||||
}
|
|
@ -5,335 +5,399 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UCharacterIterator.java,v $
|
||||
* $Date: 2002/05/14 16:48:49 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.ReplaceableString;
|
||||
import com.ibm.icu.text.StringCharacterIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import com.ibm.icu.impl.UCharArrayIterator;
|
||||
|
||||
/**
|
||||
* Internal class that iterates through a com.ibm.text.Replacable text object
|
||||
* to return either Unicode characters.
|
||||
* @author synwee
|
||||
* @version release 2.1, February 2002
|
||||
* DLF- Docs mostly need 1) much more description of iteration behavior,
|
||||
* especially at endpoints and with empty or single character strings,
|
||||
* and 2) need to describe the other major difference with Java
|
||||
* CharacterIterator, which is that this also returns code points as
|
||||
* well as code units.
|
||||
*
|
||||
* Don't understand why setIndex and moveIndex have different exception behavior.
|
||||
* I expect they shouldn't.
|
||||
*/
|
||||
public final class UCharacterIterator implements CharacterIterator
|
||||
{
|
||||
// public data members -----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text when returning
|
||||
* 16 bit character.
|
||||
*/
|
||||
public static final int DONE = 0xFFFF;
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text when returning
|
||||
* codepoints.
|
||||
*/
|
||||
public static final int DONE_CODEPOINT = -1;
|
||||
|
||||
// public constructor ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Public constructor.
|
||||
* By default the iteration range will be from 0 to the end of the text.
|
||||
* @param replacable text which the iterator will be based on
|
||||
*/
|
||||
public UCharacterIterator(Replaceable replaceable)
|
||||
{
|
||||
m_replaceable_ = replaceable;
|
||||
m_index_ = 0;
|
||||
m_start_ = 0;
|
||||
m_limit_ = replaceable.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Public constructor
|
||||
* By default the iteration range will be from 0 to the end of the text.
|
||||
* @param str text which the iterator will be based on
|
||||
*/
|
||||
public UCharacterIterator(String str)
|
||||
{
|
||||
m_replaceable_ = new ReplaceableString(str);
|
||||
m_index_ = 0;
|
||||
m_start_ = 0;
|
||||
m_limit_ = m_replaceable_.length();
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs an iterator over the given range of the given string.
|
||||
* @param text text to be iterated over
|
||||
* @param start offset of the first character to iterate
|
||||
* @param limit offset of the character following the last character to
|
||||
* iterate
|
||||
*/
|
||||
public UCharacterIterator(String str, int start, int limit)
|
||||
{
|
||||
m_replaceable_ = new ReplaceableString(str);
|
||||
m_start_ = start;
|
||||
m_limit_ = limit;
|
||||
m_index_ = m_start_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Abstract class that defines an API for iteration on text objects. This is an
|
||||
* interface for forward and backward iteration and random access into a text
|
||||
* object. Forward iteration is done with post-increment and backward iteration
|
||||
* is done with pre-decrement semantics, while the
|
||||
* <code>java.text.CharacterIterator</code> interface methods provided forward
|
||||
* iteration with "pre-increment" and backward iteration with pre-decrement
|
||||
* semantics. This API is more efficient for forward iteration over code points.
|
||||
* @author Ram
|
||||
* @version release 2.2, May 2002
|
||||
*/
|
||||
public abstract class UCharacterIterator
|
||||
implements Cloneable,UForwardCharacterIterator {
|
||||
|
||||
|
||||
// static final methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Constructs an iterator over the given range of the given replaceable
|
||||
* string.
|
||||
* @param text text to be iterated over
|
||||
* @param start offset of the first character to iterate
|
||||
* @param limit offset of the character following the last character to
|
||||
* iterate
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* <code>Replaceable</code> object.
|
||||
* @param source a valid source as a <code>Replaceable</code> object
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
*/
|
||||
public UCharacterIterator(Replaceable replaceable, int start, int limit)
|
||||
{
|
||||
m_replaceable_ = replaceable;
|
||||
m_start_ = start;
|
||||
m_limit_ = limit;
|
||||
m_index_ = m_start_;
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator.
|
||||
* Cloning will not duplicate a new Replaceable object.
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone()
|
||||
{
|
||||
try {
|
||||
return super.clone();
|
||||
}
|
||||
catch (CloneNotSupportedException e) {
|
||||
throw new InternalError(
|
||||
"Cloning by the super class java.text.CharacterIterator is not " +
|
||||
"supported");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current UTF16 character.
|
||||
* @return current UTF16 character
|
||||
*/
|
||||
public char current()
|
||||
{
|
||||
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
|
||||
return m_replaceable_.charAt(m_index_);
|
||||
}
|
||||
return DONE;
|
||||
public static final UCharacterIterator getInstance(Replaceable source){
|
||||
return new ReplaceableCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current codepoint
|
||||
* @return current codepoint
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* source string.
|
||||
* @param source a string
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
*/
|
||||
public int currentCodePoint()
|
||||
{
|
||||
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
|
||||
return m_replaceable_.char32At(m_index_);
|
||||
}
|
||||
return DONE_CODEPOINT;
|
||||
public static final UCharacterIterator getInstance(String source){
|
||||
return new ReplaceableCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the first UTF16 character in text.
|
||||
* @return the first UTF16 in text.
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* source character array.
|
||||
* @param source an array of UTF-16 code units
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
*/
|
||||
public char first()
|
||||
{
|
||||
m_index_ = m_start_;
|
||||
return current();
|
||||
public static final UCharacterIterator getInstance(char[] source){
|
||||
return getInstance(source,0,source.length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the start of the text to iterate.
|
||||
* @return by default this method will return 0, unless a range for
|
||||
* iteration had been specified during construction.
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* source character array.
|
||||
* @param source an array of UTF-16 code units
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
*/
|
||||
public int getBeginIndex()
|
||||
{
|
||||
return m_start_;
|
||||
public static final UCharacterIterator getInstance(char[] source, int start, int limit){
|
||||
return new UCharArrayIterator(source,start,limit);
|
||||
}
|
||||
/**
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* source StringBuffer.
|
||||
* @param source an string buffer of UTF-16 code units
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(StringBuffer source){
|
||||
return new ReplaceableCharacterIterator(source);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the limit offset of the text to iterate
|
||||
* @return by default this method returns the length of the text, unless a
|
||||
* range for iteration had been specified during construction.
|
||||
*/
|
||||
public int getEndIndex()
|
||||
{
|
||||
return m_limit_;
|
||||
* Returns a <code>UCharacterIterator</code> object given a
|
||||
* CharacterIterator.
|
||||
* @param source a valid CharacterIterator object.
|
||||
* @return UCharacterIterator object
|
||||
* @exception IllegalArgumentException if the argument is null
|
||||
*/
|
||||
public static final UCharacterIterator getInstance(CharacterIterator source){
|
||||
return new ICUCharacterIterator(source);
|
||||
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
/**
|
||||
* Returns a <code>java.text.CharacterIterator</code> object for
|
||||
* the underlying text of this iterator. The returned iterator is
|
||||
* independent of this iterator.
|
||||
* @return java.text.CharacterIterator object
|
||||
*/
|
||||
public CharacterIterator getCharacterIterator(){
|
||||
return new StringCharacterIterator(this.getText());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the code unit at the current index. If index is out
|
||||
* of range, returns DONE. Index is not changed.
|
||||
* @return current code unit
|
||||
*/
|
||||
public abstract int current();
|
||||
|
||||
/**
|
||||
* Returns the codepoint at the current index.
|
||||
* If the current index is invalid, DONE is returned.
|
||||
* If the current index points to a lead surrogate, and there is a following
|
||||
* trail surrogate, then the code point is returned. Otherwise, the code
|
||||
* unit at index is returned. Index is not changed.
|
||||
* @return current codepoint
|
||||
*/
|
||||
public int currentCodePoint(){
|
||||
int ch = current();
|
||||
if(UTF16.isLeadSurrogate((char)ch)){
|
||||
// advance the index to get the
|
||||
// next code point
|
||||
next();
|
||||
// due to post increment semantics
|
||||
// current() after next() actually
|
||||
// returns the char we want
|
||||
int ch2 = current();
|
||||
// current should never change
|
||||
// the current index so back off
|
||||
previous();
|
||||
|
||||
if(UTF16.isTrailSurrogate((char)ch2)){
|
||||
// we found a surrogate pair
|
||||
// return the codepoint
|
||||
return UCharacterProperty.getRawSupplementary(
|
||||
(char)ch,(char)ch2
|
||||
);
|
||||
}
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the text
|
||||
* @return length of the text
|
||||
*/
|
||||
public abstract int getLength();
|
||||
|
||||
|
||||
/**
|
||||
* Gets the current index in text.
|
||||
* @return current index in text.
|
||||
*/
|
||||
public int getIndex()
|
||||
{
|
||||
return m_index_;
|
||||
public abstract int getIndex();
|
||||
|
||||
|
||||
/**
|
||||
* Returns the UTF16 code unit at index, and increments to the next
|
||||
* code unit (post-increment semantics). If index is out of
|
||||
* range, DONE is returned, and the iterator is reset to the limit
|
||||
* of the text.
|
||||
* @return the next UTF16 code unit, or DONE if the index is at the limit
|
||||
* of the text.
|
||||
*/
|
||||
public abstract int next();
|
||||
|
||||
/**
|
||||
* Returns the code point at index, and increments to the next code
|
||||
* point (post-increment semantics). If index does not point to a
|
||||
* valid surrogate pair, the behavior is the same as
|
||||
* <code>next()<code>. Otherwise the iterator is incremented past
|
||||
* the surrogate pair, and the code point represented by the pair
|
||||
* is returned.
|
||||
* @return the next codepoint in text, or DONE if the index is at
|
||||
* the limit of the text.
|
||||
*/
|
||||
public int nextCodePoint(){
|
||||
int ch1 = next();
|
||||
if(UTF16.isLeadSurrogate((char)ch1)){
|
||||
int ch2 = next();
|
||||
if(UTF16.isTrailSurrogate((char)ch2)){
|
||||
return UCharacterProperty.getRawSupplementary((char)ch1,
|
||||
(char)ch2);
|
||||
}else{
|
||||
// unmatched surrogate so back out
|
||||
previous();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decrement to the position of the previous code unit in the
|
||||
* text, and return it (pre-decrement semantics). If the
|
||||
* resulting index is less than 0, the index is reset to 0 and
|
||||
* DONE is returned.
|
||||
* @return the previous code unit in the text, or DONE if the new
|
||||
* index is before the start of the text.
|
||||
*/
|
||||
public abstract int previous();
|
||||
|
||||
|
||||
/**
|
||||
* Retreat to the start of the previous code point in the text,
|
||||
* and return it (pre-decrement semantics). If the index is not
|
||||
* preceeded by a valid surrogate pair, the behavior is the same
|
||||
* as <code>previous()</code>. Otherwise the iterator is
|
||||
* decremented to the start of the surrogate pair, and the code
|
||||
* point represented by the pair is returned.
|
||||
* @return the previous code point in the text, or DONE if the new
|
||||
* index is before the start of the text.
|
||||
*/
|
||||
public int previousCodePoint(){
|
||||
int ch1 = previous();
|
||||
if(UTF16.isTrailSurrogate((char)ch1)){
|
||||
int ch2 = previous();
|
||||
if(UTF16.isLeadSurrogate((char)ch2)){
|
||||
return UCharacterProperty.getRawSupplementary((char)ch2,
|
||||
(char)ch1);
|
||||
}else{
|
||||
//unmatched trail surrogate so back out
|
||||
next();
|
||||
}
|
||||
}
|
||||
return ch1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the index to the specified index in the text.
|
||||
* @param index the index within the text.
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid index is
|
||||
* supplied
|
||||
*/
|
||||
public abstract void setIndex(int index);
|
||||
|
||||
/**
|
||||
* Sets the current index to the limit.
|
||||
*/
|
||||
public void setToLimit() {
|
||||
setIndex(getLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last UTF16 iterateable character from the text and shifts the
|
||||
* index to the end of the text accordingly.
|
||||
* @return the last UTF16 iterateable character
|
||||
* Sets the current index to the start.
|
||||
*/
|
||||
public char last()
|
||||
{
|
||||
if (m_limit_ != m_start_) {
|
||||
m_index_ = m_limit_ - 1;
|
||||
return m_replaceable_.charAt(m_index_);
|
||||
}
|
||||
m_index_ = m_limit_;
|
||||
return DONE;
|
||||
public void setToStart() {
|
||||
setIndex(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next UTF16 character and increments the iterator's index by 1.
|
||||
* If the resulting index is greater or equal to the iteration limit, the
|
||||
* index is reset to the text iteration limit and a value of DONE_CODEPOINT is
|
||||
* returned.
|
||||
* @return next UTF16 character in text or DONE if the new index is off the
|
||||
* end of the text iteration limit.
|
||||
*/
|
||||
public char next()
|
||||
{
|
||||
if (m_index_ < m_limit_) {
|
||||
char result = m_replaceable_.charAt(m_index_);
|
||||
m_index_ ++;
|
||||
return result;
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next codepoint after current index and increments the iterator's
|
||||
* index by a number depending on the returned codepoint.
|
||||
* This assumes the text is stored as 16-bit code units
|
||||
* with surrogate pairs intermixed. If the index of a leading or trailing
|
||||
* code unit of a surrogate pair is given, return the code point after the
|
||||
* surrogate pair.
|
||||
* If the resulting index is greater or equal to the text iterateable limit,
|
||||
* the current index is reset to the text iterateable limit and a value of
|
||||
* DONE_CODEPOINT is returned.
|
||||
* @return next codepoint in text or DONE_CODEPOINT if the new index is off the
|
||||
* end of the text iterateable limit.
|
||||
*/
|
||||
public int nextCodePoint()
|
||||
{
|
||||
if (m_index_ < m_limit_) {
|
||||
char ch = m_replaceable_.charAt(m_index_);
|
||||
m_index_ ++;
|
||||
if (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
|
||||
ch <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
|
||||
m_index_ < m_limit_) {
|
||||
char trail = m_replaceable_.charAt(m_index_);
|
||||
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
|
||||
trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
m_index_ ++;
|
||||
return UCharacterProperty.getRawSupplementary(ch,
|
||||
trail);
|
||||
}
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
return DONE_CODEPOINT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns previous UTF16 character and decrements the iterator's index by
|
||||
* 1.
|
||||
* If the resulting index is less than the text iterateable limit, the
|
||||
* index is reset to the start of the text iteration and a value of
|
||||
* DONE_CODEPOINT is returned.
|
||||
* @return next UTF16 character in text or DONE if the new index is off the
|
||||
* start of the text iteration range.
|
||||
* Fills the buffer with the underlying text storage of the iterator
|
||||
* If the buffer capacity is not enough a exception is thrown. The capacity
|
||||
* of the fill in buffer should at least be equal to length of text in the
|
||||
* iterator obtained by calling <code>getLength()</code>.
|
||||
* <b>Usage:</b>
|
||||
*
|
||||
* <code>
|
||||
* <pre>
|
||||
* UCharacterIterator iter = UCharacterIterator.getInstance(text);
|
||||
* char[] buf = new char[iter.getLength()];
|
||||
* iter.getText(buf);
|
||||
*
|
||||
* OR
|
||||
* char[] buf= new char[1];
|
||||
* int len = 0;
|
||||
* for(;;){
|
||||
* try{
|
||||
* len = iter.getText(buf);
|
||||
* break;
|
||||
* }catch(IndexOutOfBoundsException e){
|
||||
* buf = new char[iter.getLength()];
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
* </code>
|
||||
*
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @param offset the position within the array to start putting the data.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBoundsException if there is not enough
|
||||
* room after offset in the array, or if offset < 0.
|
||||
*/
|
||||
public char previous()
|
||||
{
|
||||
if (m_index_ > m_start_) {
|
||||
m_index_ --;
|
||||
return m_replaceable_.charAt(m_index_);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns previous codepoint before current index and decrements the
|
||||
* iterator's index by a number depending on the returned codepoint.
|
||||
* This assumes the text is stored as 16-bit code units
|
||||
* with surrogate pairs intermixed. If the index of a leading or trailing
|
||||
* code unit of a surrogate pair is given, return the code point before the
|
||||
* surrogate pair.
|
||||
* If the resulting index is less than the text iterateable range, the
|
||||
* current index is reset to the start of the range and a value of
|
||||
* DONE_CODEPOINT is returned.
|
||||
* @return previous codepoint in text or DONE_CODEPOINT if the new index is
|
||||
* off the start of the text iteration range.
|
||||
*/
|
||||
public int previousCodePoint()
|
||||
{
|
||||
if (m_index_ > m_start_) {
|
||||
m_index_ --;
|
||||
char ch = m_replaceable_.charAt(m_index_);
|
||||
if (ch >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
|
||||
ch <= UTF16.TRAIL_SURROGATE_MAX_VALUE &&
|
||||
m_index_ > m_start_) {
|
||||
char lead = m_replaceable_.charAt(m_index_);
|
||||
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
|
||||
lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
m_index_ --;
|
||||
return UCharacterProperty.getRawSupplementary(ch,
|
||||
lead);
|
||||
}
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
return DONE_CODEPOINT;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Sets the index to the specified index in the text and returns that
|
||||
* single UTF16 character at index.
|
||||
* This assumes the text is stored as 16-bit code units.</p>
|
||||
* @param index the index within the text.
|
||||
* @exception IllegalArgumentException is thrown if an invalid index is
|
||||
* supplied. i.e. index is out of bounds.
|
||||
* @return the character at the specified index or DONE if the specified
|
||||
* index is equal to the limit of the text iteration range.
|
||||
*/
|
||||
public char setIndex(int index)
|
||||
{
|
||||
if (index < m_start_ || index > m_limit_) {
|
||||
throw new IllegalArgumentException("Index index out of bounds");
|
||||
public int getText(char[] fillIn, int offset) {
|
||||
int len = getLength();
|
||||
if (offset < 0 || offset + len > fillIn.length) {
|
||||
throw new IndexOutOfBoundsException(Integer.toString(offset));
|
||||
}
|
||||
m_index_ = index;
|
||||
return current();
|
||||
}
|
||||
|
||||
// private data members ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Replaceable object
|
||||
*/
|
||||
private Replaceable m_replaceable_;
|
||||
/**
|
||||
* Current index
|
||||
*/
|
||||
private int m_index_;
|
||||
/**
|
||||
* Start offset of iterateable range, by default this is 0
|
||||
*/
|
||||
private int m_start_;
|
||||
/**
|
||||
* Limit offset of iterateable range, by default this is the length of the
|
||||
* string
|
||||
*/
|
||||
private int m_limit_;
|
||||
int index = getIndex();
|
||||
setToStart();
|
||||
int ch;
|
||||
while ((ch = next())!= DONE) {
|
||||
fillIn[offset++] = (char)ch;
|
||||
}
|
||||
setIndex(index);
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience override for <code>getText(char[], int)>/code> that provides
|
||||
* an offset of 0.
|
||||
* @param fillIn an array of chars to fill with the underlying UTF-16 code
|
||||
* units.
|
||||
* @return the number of code units added to fillIn, as a convenience
|
||||
* @exception IndexOutOfBounds exception if there is not enough
|
||||
* room in the array.
|
||||
*/
|
||||
public final int getText(char[] fillIn) {
|
||||
return getText(fillIn, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method for returning the underlying text storage as as string
|
||||
* @return the underlying text storage in the iterator as a string
|
||||
*/
|
||||
public String getText() {
|
||||
char[] text = new char[getLength()];
|
||||
getText(text);
|
||||
return new String(text);
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the current position by the number of code units
|
||||
* specified, either forward or backward depending on the sign
|
||||
* of delta (positive or negative respectively). If the resulting
|
||||
* index would be less than zero, the index is set to zero, and if
|
||||
* the resulting index would be greater than limit, the index is
|
||||
* set to limit.
|
||||
*
|
||||
* @param delta the number of code units to move the current
|
||||
* index.
|
||||
* @return the new index.
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid index is
|
||||
* supplied
|
||||
*
|
||||
*/
|
||||
public int moveIndex(int delta) {
|
||||
int x = Math.max(0, Math.min(getIndex() + delta, getLength()));
|
||||
setIndex(x);
|
||||
return x;
|
||||
}
|
||||
|
||||
/**
|
||||
* Moves the current position by the number of code points
|
||||
* specified, either forward or backward depending on the sign of
|
||||
* delta (positive or negative respectively). If the current index
|
||||
* is at a trail surrogate then the first adjustment is by code
|
||||
* unit, and the remaining adjustments are by code points. If the
|
||||
* resulting index would be less than zero, the index is set to
|
||||
* zero, and if the resulting index would be greater than limit,
|
||||
* the index is set to limit.
|
||||
* @param delta the number of code units to move the current index.
|
||||
* @return the new index
|
||||
* @exception IndexOutOfBoundsException is thrown if an invalid delta is
|
||||
* supplied
|
||||
*/
|
||||
public int moveCodePointIndex(int delta){
|
||||
if(delta>0){
|
||||
while(delta-->0 && nextCodePoint() != DONE);
|
||||
}else{
|
||||
while(delta++<0 && previousCodePoint() != DONE);
|
||||
}
|
||||
if(delta!=0){
|
||||
throw new IndexOutOfBoundsException();
|
||||
}
|
||||
|
||||
return getIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator, independent from other iterators.
|
||||
* If it is not possible to clone the iterator, returns null.
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone() throws CloneNotSupportedException{
|
||||
return super.clone();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -6,8 +6,8 @@
|
|||
*
|
||||
* $Source:
|
||||
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterPropertyDB.java $
|
||||
* $Date: 2002/04/04 00:52:27 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -760,7 +760,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return size of the lower case character in UTF16 format
|
||||
*/
|
||||
public int getSpecialLowerCase(Locale locale, int index, int ch,
|
||||
UCharacterIterator uchariter,
|
||||
UnicodeCharacterIterator uchariter,
|
||||
StringBuffer buffer)
|
||||
{
|
||||
int exception = getException(index,
|
||||
|
@ -874,7 +874,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return size of the lowercased codepoint in UTF16 format
|
||||
*/
|
||||
public int toLowerCase(Locale locale, int ch,
|
||||
UCharacterIterator uchariter,
|
||||
UnicodeCharacterIterator uchariter,
|
||||
StringBuffer buffer)
|
||||
{
|
||||
int props = getProperty(ch);
|
||||
|
@ -909,7 +909,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return size of lowercased codepoint in UTF16 format
|
||||
*/
|
||||
public int toLowerCase(Locale locale, int ch,
|
||||
UCharacterIterator uchariter, char buffer[])
|
||||
UnicodeCharacterIterator uchariter, char buffer[])
|
||||
{
|
||||
int props = getProperty(ch);
|
||||
if (!UCharacterProperty.isExceptionIndicator(props)) {
|
||||
|
@ -953,7 +953,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
public void toLowerCase(Locale locale, String str, int start, int limit,
|
||||
StringBuffer result)
|
||||
{
|
||||
UCharacterIterator ucharIter = new UCharacterIterator(str);
|
||||
UnicodeCharacterIterator ucharIter = new UnicodeCharacterIterator(str);
|
||||
int strIndex = start;
|
||||
|
||||
while (strIndex < limit) {
|
||||
|
@ -980,7 +980,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return size of uppercased codepoint in UTF16 format
|
||||
*/
|
||||
public int getSpecialUpperOrTitleCase(Locale locale, int index, int ch,
|
||||
UCharacterIterator uchariter,
|
||||
UnicodeCharacterIterator uchariter,
|
||||
boolean upperflag,
|
||||
StringBuffer buffer)
|
||||
{
|
||||
|
@ -1041,7 +1041,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return size of uppercased codepoint in UTF16 format
|
||||
*/
|
||||
public int toUpperOrTitleCase(Locale locale, int ch,
|
||||
UCharacterIterator uchariter,
|
||||
UnicodeCharacterIterator uchariter,
|
||||
boolean upperflag, StringBuffer buffer)
|
||||
{
|
||||
int props = getProperty(ch);
|
||||
|
@ -1083,7 +1083,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return size of uppercased codepoint in UTF16 format
|
||||
*/
|
||||
public int toUpperOrTitleCase(Locale locale, int ch,
|
||||
UCharacterIterator uchariter,
|
||||
UnicodeCharacterIterator uchariter,
|
||||
boolean upperflag, char buffer[])
|
||||
{
|
||||
int props = getProperty(ch);
|
||||
|
@ -1133,7 +1133,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
*/
|
||||
public String toUpperCase(Locale locale, String str, int start, int limit)
|
||||
{
|
||||
UCharacterIterator ucharIter = new UCharacterIterator(str);
|
||||
UnicodeCharacterIterator ucharIter = new UnicodeCharacterIterator(str);
|
||||
int strIndex = start;
|
||||
StringBuffer result = new StringBuffer(limit - start);
|
||||
|
||||
|
@ -1170,7 +1170,7 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
public String toTitleCase(Locale locale, String str,
|
||||
BreakIterator breakiter)
|
||||
{
|
||||
UCharacterIterator ucharIter = new UCharacterIterator(str);
|
||||
UnicodeCharacterIterator ucharIter = new UnicodeCharacterIterator(str);
|
||||
int length = str.length();
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
||||
|
@ -1583,13 +1583,13 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* the set { 'i', 'j', U+012f, U+1e2d, U+1ecb }
|
||||
* @see SpecialCasing.txt
|
||||
*/
|
||||
private static boolean isAFTER_i(UCharacterIterator uchariter, int offset)
|
||||
private static boolean isAFTER_i(UnicodeCharacterIterator uchariter, int offset)
|
||||
{
|
||||
uchariter.setIndex(offset);
|
||||
|
||||
int ch = uchariter.previousCodePoint();
|
||||
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
if (ch == LATIN_SMALL_LETTER_I_ || ch == LATIN_SMALL_LETTER_J_ ||
|
||||
ch == LATIN_SMALL_LETTER_I_WITH_OGONEK_ ||
|
||||
ch == LATIN_SMALL_LETTER_I_WITH_TILDE_BELOW_ ||
|
||||
|
@ -1618,13 +1618,13 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* character 'I' with no intervening combining class = 230
|
||||
* @see SpecialCasing.txt
|
||||
*/
|
||||
private static boolean isAFTER_I(UCharacterIterator uchariter, int offset)
|
||||
private static boolean isAFTER_I(UnicodeCharacterIterator uchariter, int offset)
|
||||
{
|
||||
uchariter.setIndex(offset);
|
||||
|
||||
int ch = uchariter.previousCodePoint();
|
||||
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
if (ch == LATIN_CAPITAL_LETTER_I_) {
|
||||
return true; // preceded by I
|
||||
}
|
||||
|
@ -1650,14 +1650,14 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return false if any character after offset in src is a cased letter
|
||||
* @see SpecialCasing.txt
|
||||
*/
|
||||
private boolean isCFINAL(UCharacterIterator uchariter, int offset)
|
||||
private boolean isCFINAL(UnicodeCharacterIterator uchariter, int offset)
|
||||
{
|
||||
// iterator should have been determined to be not null by caller
|
||||
uchariter.setIndex(offset);
|
||||
uchariter.nextCodePoint(); // rid of current codepoint
|
||||
int ch = uchariter.nextCodePoint(); // start checking
|
||||
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
int cat = getType(ch);
|
||||
if (cat == UCharacterCategory.LOWERCASE_LETTER ||
|
||||
cat == UCharacterCategory.UPPERCASE_LETTER ||
|
||||
|
@ -1681,13 +1681,13 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* @return true if any character before index in src is a cased letter
|
||||
* @see SpecialCasing.txt
|
||||
*/
|
||||
private boolean isNotCINITIAL(UCharacterIterator uchariter,
|
||||
private boolean isNotCINITIAL(UnicodeCharacterIterator uchariter,
|
||||
int offset)
|
||||
{
|
||||
uchariter.setIndex(offset);
|
||||
int ch = uchariter.previousCodePoint();
|
||||
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
int cat = getType(ch);
|
||||
if (cat == UCharacterCategory.LOWERCASE_LETTER ||
|
||||
cat == UCharacterCategory.UPPERCASE_LETTER ||
|
||||
|
@ -1712,14 +1712,14 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* of combining class = 230.
|
||||
* @see SpecialCasing.txt
|
||||
*/
|
||||
private static boolean isFollowedByMOREABOVE(UCharacterIterator uchariter,
|
||||
private static boolean isFollowedByMOREABOVE(UnicodeCharacterIterator uchariter,
|
||||
int offset)
|
||||
{
|
||||
uchariter.setIndex(offset);
|
||||
uchariter.nextCodePoint(); // rid of current codepoint
|
||||
int ch = uchariter.nextCodePoint(); // start checking
|
||||
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
int cc = NormalizerImpl.getCombiningClass(ch);
|
||||
if (cc == COMBINING_MARK_ABOVE_CLASS_) {
|
||||
return true; // at least one cc==230 following
|
||||
|
@ -1742,14 +1742,14 @@ public final class UCharacterProperty implements Trie.DataManipulate
|
|||
* with no characters of combining class == 230 in between
|
||||
* @see SpecialCasing.txt
|
||||
*/
|
||||
private static boolean isFollowedByDotAbove(UCharacterIterator uchariter,
|
||||
private static boolean isFollowedByDotAbove(UnicodeCharacterIterator uchariter,
|
||||
int offset)
|
||||
{
|
||||
uchariter.setIndex(offset);
|
||||
uchariter.nextCodePoint(); // rid off current character
|
||||
int ch = uchariter.nextCodePoint(); // start checking
|
||||
|
||||
while (ch != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (ch != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
if (ch == COMBINING_DOT_ABOVE_) {
|
||||
return true;
|
||||
}
|
||||
|
|
93
icu4j/src/com/ibm/icu/impl/UForwardCharacterIterator.java
Normal file
93
icu4j/src/com/ibm/icu/impl/UForwardCharacterIterator.java
Normal file
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2000, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UForwardCharacterIterator.java,v $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
/**
 * Interface that defines an API for forward-only iteration
 * on text objects.
 * This is a minimal interface for iteration without random access
 * or backwards iteration. It is especially useful for wrapping
 * streams with converters into an object for collation or
 * normalization.
 *
 * <p>Characters can be accessed in two ways: as code units or as
 * code points.
 * Unicode code points are 21-bit integers and are the scalar values
 * of Unicode characters. ICU uses the type <code>int</code> for them.
 * Unicode code units are the storage units of a given
 * Unicode/UCS Transformation Format (a character encoding scheme).
 * With UTF-16, all code points can be represented with either one
 * or two code units ("surrogates").
 * String storage is typically based on code units, while properties
 * of characters are typically determined using code point values.
 * Some processes may be designed to work with sequences of code units,
 * or it may be known that all characters that are important to an
 * algorithm can be represented with single code units.
 * Other processes will need to use the code point access functions.</p>
 *
 * <p>UForwardCharacterIterator provides next() to access
 * a code unit and advance an internal position into the text object,
 * similar to a <code>return text[position++]</code>.<br>
 * It provides nextCodePoint() to access a code point and advance an
 * internal position.</p>
 *
 * <p>nextCodePoint() assumes that the current position is that of
 * the beginning of a code point, i.e., of its first code unit.
 * After nextCodePoint(), this will be true again.
 * In general, access to code units and code points in the same
 * iteration loop should not be mixed. In UTF-16, if the current position
 * is on a second code unit (Low Surrogate), then only that code unit
 * is returned even by nextCodePoint().</p>
 *
 * <p>Usage:</p>
 * <pre>
 *  public void function1(UForwardCharacterIterator it) {
 *      int c;
 *      while((c=it.next())!=UForwardCharacterIterator.DONE) {
 *          // use c
 *      }
 *  }
 * </pre>
 */
public interface UForwardCharacterIterator {

    /**
     * Indicator that we have reached the end of the UTF16 text.
     */
    int DONE = -1;

    /**
     * Returns the UTF16 code unit at index, and increments to the next
     * code unit (post-increment semantics). If index is out of
     * range, DONE is returned, and the iterator is reset to the limit
     * of the text.
     * @return the next UTF16 code unit, or DONE if the index is at the limit
     *         of the text.
     */
    int next();

    /**
     * Returns the code point at index, and increments to the next code
     * point (post-increment semantics). If index does not point to a
     * valid surrogate pair, the behavior is the same as
     * <code>next()</code>. Otherwise the iterator is incremented past
     * the surrogate pair, and the code point represented by the pair
     * is returned.
     * @return the next codepoint in text, or DONE if the index is at
     *         the limit of the text.
     */
    int nextCodePoint();

}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/USerializedSet.java,v $
|
||||
* $Date: 2002/03/28 01:50:59 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -31,6 +31,8 @@ public final class USerializedSet {
|
|||
arrayOffset=bmpLength=length=0;
|
||||
|
||||
length=src[srcStart++];
|
||||
|
||||
|
||||
if((length&0x8000) >0) {
|
||||
/* there are supplementary values */
|
||||
length&=0x7fff;
|
||||
|
@ -47,8 +49,9 @@ public final class USerializedSet {
|
|||
}
|
||||
bmpLength=length;
|
||||
}
|
||||
array=src;
|
||||
arrayOffset=srcStart;
|
||||
array = new char[length];
|
||||
System.arraycopy(src,srcStart,array,0,length);
|
||||
//arrayOffset=srcStart;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -83,9 +86,7 @@ public final class USerializedSet {
|
|||
if(rangeIndex<0) {
|
||||
return false;
|
||||
}
|
||||
if(array==null){
|
||||
array = new char[8];
|
||||
}
|
||||
|
||||
range=new int[2];
|
||||
|
||||
rangeIndex*=2; /* address start/limit pairs */
|
||||
|
@ -122,7 +123,7 @@ public final class USerializedSet {
|
|||
if( 0x10ffff<c) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if(c<0xffff) {
|
||||
bmpLength=length=2;
|
||||
array[0]=(char)c;
|
||||
|
@ -157,7 +158,9 @@ public final class USerializedSet {
|
|||
if(array==null){
|
||||
array = new char[8];
|
||||
}
|
||||
range=new int[2];
|
||||
if(range==null || range.length <2){
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
rangeIndex*=2; /* address start/limit pairs */
|
||||
if(rangeIndex<bmpLength) {
|
||||
range[0]=array[rangeIndex++];
|
||||
|
@ -168,6 +171,7 @@ public final class USerializedSet {
|
|||
} else {
|
||||
range[1]=0x110000;
|
||||
}
|
||||
range[1]-=1;
|
||||
return true;
|
||||
} else {
|
||||
rangeIndex-=bmpLength;
|
||||
|
@ -182,7 +186,8 @@ public final class USerializedSet {
|
|||
} else {
|
||||
range[1]=0x110000;
|
||||
}
|
||||
return false;
|
||||
range[1]-=1;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
|
@ -216,6 +221,6 @@ public final class USerializedSet {
|
|||
return (bmpLength+(length-bmpLength)/2+1)/2;
|
||||
}
|
||||
|
||||
private char array[];
|
||||
private char array[] = new char[8];
|
||||
private int arrayOffset, bmpLength, length;
|
||||
}
|
339
icu4j/src/com/ibm/icu/impl/UnicodeCharacterIterator.java
Normal file
339
icu4j/src/com/ibm/icu/impl/UnicodeCharacterIterator.java
Normal file
|
@ -0,0 +1,339 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2000, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Attic/UnicodeCharacterIterator.java,v $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.text.Replaceable;
|
||||
import com.ibm.icu.text.ReplaceableString;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import java.text.CharacterIterator;
|
||||
|
||||
/**
|
||||
* Internal class that iterates through a com.ibm.icu.text.Replaceable text object
|
||||
* to return either UTF16 characters or Unicode codepoints.
|
||||
* @author synwee
|
||||
* @version release 2.1, February 2002
|
||||
*/
|
||||
public final class UnicodeCharacterIterator implements CharacterIterator
|
||||
{
|
||||
// public data members -----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text when returning
|
||||
* 16 bit character.
|
||||
*/
|
||||
public static final int DONE = 0xFFFF;
|
||||
/**
|
||||
* Indicator that we have reached the ends of the UTF16 text when returning
|
||||
* codepoints.
|
||||
*/
|
||||
public static final int DONE_CODEPOINT = -1;
|
||||
|
||||
// public constructor ------------------------------------------------------
|
||||
|
||||
/**
 * Creates an iterator over the whole of a Replaceable text.
 * The iteration range runs from 0 to <code>replaceable.length()</code>
 * and the current index starts at 0.
 * @param replaceable text which the iterator will be based on
 */
public UnicodeCharacterIterator(Replaceable replaceable)
{
    this.m_replaceable_ = replaceable;
    this.m_start_       = 0;
    this.m_limit_       = replaceable.length();
    this.m_index_       = 0;
}
|
||||
|
||||
/**
 * Creates an iterator over the whole of a string.
 * The string is wrapped in a ReplaceableString; the iteration range runs
 * from 0 to the length of that wrapper and the current index starts at 0.
 * @param str text which the iterator will be based on
 */
public UnicodeCharacterIterator(String str)
{
    this.m_replaceable_ = new ReplaceableString(str);
    this.m_start_       = 0;
    this.m_limit_       = this.m_replaceable_.length();
    this.m_index_       = 0;
}
|
||||
|
||||
/**
 * Constructs an iterator over the given range of the given string.
 * The string is wrapped in a ReplaceableString and the current index
 * starts at <code>start</code>. The range is stored as given; no bounds
 * checking is performed here.
 * @param str text to be iterated over
 * @param start offset of the first character to iterate
 * @param limit offset of the character following the last character to
 *        iterate
 */
public UnicodeCharacterIterator(String str, int start, int limit)
{
    this.m_replaceable_ = new ReplaceableString(str);
    this.m_limit_       = limit;
    this.m_start_       = start;
    this.m_index_       = start;
}
|
||||
|
||||
/**
 * Constructs an iterator over the given range of the given replaceable
 * text. The current index starts at <code>start</code>. The range is
 * stored as given; no bounds checking is performed here.
 * @param replaceable text to be iterated over
 * @param start offset of the first character to iterate
 * @param limit offset of the character following the last character to
 *        iterate
 */
public UnicodeCharacterIterator(Replaceable replaceable, int start, int limit)
{
    this.m_replaceable_ = replaceable;
    this.m_limit_       = limit;
    this.m_start_       = start;
    this.m_index_       = start;
}
|
||||
|
||||
// public methods ----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Creates a copy of this iterator.
|
||||
* Cloning will not duplicate a new Replaceable object.
|
||||
* @return copy of this iterator
|
||||
*/
|
||||
public Object clone()
|
||||
{
|
||||
try {
|
||||
return super.clone();
|
||||
}
|
||||
catch (CloneNotSupportedException e) {
|
||||
throw new InternalError(
|
||||
"Cloning by the super class java.text.CharacterIterator is not " +
|
||||
"supported");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current UTF16 character.
|
||||
* @return current UTF16 character
|
||||
*/
|
||||
public char current()
|
||||
{
|
||||
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
|
||||
return m_replaceable_.charAt(m_index_);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current codepoint
|
||||
* @return current codepoint
|
||||
*/
|
||||
public int currentCodePoint()
|
||||
{
|
||||
if (m_index_ >= m_start_ && m_index_ < m_limit_) {
|
||||
return m_replaceable_.char32At(m_index_);
|
||||
}
|
||||
return DONE_CODEPOINT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the first UTF16 character in text.
|
||||
* @return the first UTF16 in text.
|
||||
*/
|
||||
public char first()
|
||||
{
|
||||
m_index_ = m_start_;
|
||||
return current();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the start of the text to iterate.
|
||||
* @return by default this method will return 0, unless a range for
|
||||
* iteration had been specified during construction.
|
||||
*/
|
||||
public int getBeginIndex()
|
||||
{
|
||||
return m_start_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the limit offset of the text to iterate
|
||||
* @return by default this method returns the length of the text, unless a
|
||||
* range for iteration had been specified during construction.
|
||||
*/
|
||||
public int getEndIndex()
|
||||
{
|
||||
return m_limit_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the current index in text.
|
||||
* @return current index in text.
|
||||
*/
|
||||
public int getIndex()
|
||||
{
|
||||
return m_index_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the last UTF16 iterateable character from the text and shifts the
|
||||
* index to the end of the text accordingly.
|
||||
* @return the last UTF16 iterateable character
|
||||
*/
|
||||
public char last()
|
||||
{
|
||||
if (m_limit_ != m_start_) {
|
||||
m_index_ = m_limit_ - 1;
|
||||
return m_replaceable_.charAt(m_index_);
|
||||
}
|
||||
m_index_ = m_limit_;
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next UTF16 character and increments the iterator's index by 1.
|
||||
* If the resulting index is greater or equal to the iteration limit, the
|
||||
* index is reset to the text iteration limit and a value of DONE_CODEPOINT is
|
||||
* returned.
|
||||
* @return next UTF16 character in text or DONE if the new index is off the
|
||||
* end of the text iteration limit.
|
||||
*/
|
||||
public char next()
|
||||
{
|
||||
if (m_index_ < m_limit_) {
|
||||
char result = m_replaceable_.charAt(m_index_);
|
||||
m_index_ ++;
|
||||
return result;
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns next codepoint after current index and increments the iterator's
|
||||
* index by a number depending on the returned codepoint.
|
||||
* This assumes the text is stored as 16-bit code units
|
||||
* with surrogate pairs intermixed. If the index of a leading or trailing
|
||||
* code unit of a surrogate pair is given, return the code point after the
|
||||
* surrogate pair.
|
||||
* If the resulting index is greater or equal to the text iterateable limit,
|
||||
* the current index is reset to the text iterateable limit and a value of
|
||||
* DONE_CODEPOINT is returned.
|
||||
* @return next codepoint in text or DONE_CODEPOINT if the new index is off the
|
||||
* end of the text iterateable limit.
|
||||
*/
|
||||
public int nextCodePoint()
|
||||
{
|
||||
if (m_index_ < m_limit_) {
|
||||
char ch = m_replaceable_.charAt(m_index_);
|
||||
m_index_ ++;
|
||||
if (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
|
||||
ch <= UTF16.LEAD_SURROGATE_MAX_VALUE &&
|
||||
m_index_ < m_limit_) {
|
||||
char trail = m_replaceable_.charAt(m_index_);
|
||||
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
|
||||
trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
m_index_ ++;
|
||||
return UCharacterProperty.getRawSupplementary(ch,
|
||||
trail);
|
||||
}
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
return DONE_CODEPOINT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns previous UTF16 character and decrements the iterator's index by
|
||||
* 1.
|
||||
* If the resulting index is less than the text iterateable limit, the
|
||||
* index is reset to the start of the text iteration and a value of
|
||||
* DONE_CODEPOINT is returned.
|
||||
* @return next UTF16 character in text or DONE if the new index is off the
|
||||
* start of the text iteration range.
|
||||
*/
|
||||
public char previous()
|
||||
{
|
||||
if (m_index_ > m_start_) {
|
||||
m_index_ --;
|
||||
return m_replaceable_.charAt(m_index_);
|
||||
}
|
||||
return DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns previous codepoint before current index and decrements the
|
||||
* iterator's index by a number depending on the returned codepoint.
|
||||
* This assumes the text is stored as 16-bit code units
|
||||
* with surrogate pairs intermixed. If the index of a leading or trailing
|
||||
* code unit of a surrogate pair is given, return the code point before the
|
||||
* surrogate pair.
|
||||
* If the resulting index is less than the text iterateable range, the
|
||||
* current index is reset to the start of the range and a value of
|
||||
* DONE_CODEPOINT is returned.
|
||||
* @return previous codepoint in text or DONE_CODEPOINT if the new index is
|
||||
* off the start of the text iteration range.
|
||||
*/
|
||||
public int previousCodePoint()
|
||||
{
|
||||
if (m_index_ > m_start_) {
|
||||
m_index_ --;
|
||||
char ch = m_replaceable_.charAt(m_index_);
|
||||
if (ch >= UTF16.TRAIL_SURROGATE_MIN_VALUE &&
|
||||
ch <= UTF16.TRAIL_SURROGATE_MAX_VALUE &&
|
||||
m_index_ > m_start_) {
|
||||
char lead = m_replaceable_.charAt(m_index_);
|
||||
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE &&
|
||||
lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
|
||||
m_index_ --;
|
||||
return UCharacterProperty.getRawSupplementary(ch,
|
||||
lead);
|
||||
}
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
return DONE_CODEPOINT;
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Sets the index to the specified index in the text and returns that
|
||||
* single UTF16 character at index.
|
||||
* This assumes the text is stored as 16-bit code units.</p>
|
||||
* @param index the index within the text.
|
||||
* @exception IllegalArgumentException is thrown if an invalid index is
|
||||
* supplied. i.e. index is out of bounds.
|
||||
* @return the character at the specified index or DONE if the specified
|
||||
* index is equal to the limit of the text iteration range.
|
||||
*/
|
||||
public char setIndex(int index)
|
||||
{
|
||||
if (index < m_start_ || index > m_limit_) {
|
||||
throw new IllegalArgumentException("Index index out of bounds");
|
||||
}
|
||||
m_index_ = index;
|
||||
return current();
|
||||
}
|
||||
|
||||
// private data members ----------------------------------------------------
|
||||
|
||||
/**
|
||||
* Replaceable object
|
||||
*/
|
||||
private Replaceable m_replaceable_;
|
||||
/**
|
||||
* Current index
|
||||
*/
|
||||
private int m_index_;
|
||||
/**
|
||||
* Start offset of iterateable range, by default this is 0
|
||||
*/
|
||||
private int m_start_;
|
||||
/**
|
||||
* Limit offset of iterateable range, by default this is the length of the
|
||||
* string
|
||||
*/
|
||||
private int m_limit_;
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/Utility.java,v $
|
||||
* $Date: 2002/02/25 22:43:57 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2002/06/20 01:18:09 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -91,6 +91,25 @@ public final class Utility {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience utility to compare two Object[]s
|
||||
* Ought to be in System.
|
||||
* @param len the length to compare.
|
||||
* The start indices and start+len must be valid.
|
||||
*/
|
||||
public final static boolean arrayRegionMatches(char[] source, int sourceStart,
|
||||
char[] target, int targetStart,
|
||||
int len)
|
||||
{
|
||||
int sourceEnd = sourceStart + len;
|
||||
int delta = targetStart - sourceStart;
|
||||
for (int i = sourceStart; i < sourceEnd; i++) {
|
||||
if (source[i]!=target[i + delta])
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience utility to compare two int[]s.
|
||||
* @param len the length to compare.
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:665f02a0fd842a47ca65ecf36c1d301ef5cae01990b68f05695cfc693a783406
|
||||
size 106300
|
||||
oid sha256:a5b2036d17d077b24f01e187e005a8cd3d84bfd9fea94c505eb24db9ca57492a
|
||||
size 108044
|
||||
|
|
|
@ -5,14 +5,14 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/BOSCU.java,v $
|
||||
* $Date: 2002/05/14 16:48:48 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.impl.UCharacterIterator;
|
||||
import com.ibm.icu.impl.UnicodeCharacterIterator;
|
||||
|
||||
/**
|
||||
* <p>Binary Ordered Compression Scheme for Unicode</p>
|
||||
|
@ -105,9 +105,9 @@ public class BOSCU
|
|||
int offset)
|
||||
{
|
||||
int prev = 0;
|
||||
UCharacterIterator iterator = new UCharacterIterator(source);
|
||||
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator(source);
|
||||
int codepoint = iterator.nextCodePoint();
|
||||
while (codepoint != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (codepoint != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
if (prev < 0x4e00 || prev >= 0xa000) {
|
||||
prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
|
||||
}
|
||||
|
@ -133,9 +133,9 @@ public class BOSCU
|
|||
{
|
||||
int prev = 0;
|
||||
int result = 0;
|
||||
UCharacterIterator iterator = new UCharacterIterator(source);
|
||||
UnicodeCharacterIterator iterator = new UnicodeCharacterIterator(source);
|
||||
int codepoint = iterator.nextCodePoint();
|
||||
while (codepoint != UCharacterIterator.DONE_CODEPOINT) {
|
||||
while (codepoint != UnicodeCharacterIterator.DONE_CODEPOINT) {
|
||||
if (prev < 0x4e00 || prev >= 0xa000) {
|
||||
prev = (prev & ~0x7f) - SLOPE_REACH_NEG_1_;
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CanonicalIterator.java,v $
|
||||
* $Date: 2002/03/20 22:55:33 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -17,7 +17,8 @@ import com.ibm.icu.lang.*;
|
|||
import java.util.Enumeration;
|
||||
import java.util.Vector;
|
||||
import java.util.*;
|
||||
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
import com.ibm.icu.impl.USerializedSet;
|
||||
/**
|
||||
* This class allows one to iterate through all the strings that are canonically equivalent to a given
|
||||
* string. For example, here are some sample results:
|
||||
|
@ -103,7 +104,7 @@ public class CanonicalIterator {
|
|||
* while changing the source string, saving object creation.
|
||||
*/
|
||||
public void setSource(String newSource) {
|
||||
source = Normalizer.normalize(newSource, Normalizer.DECOMP, 0);
|
||||
source = Normalizer.normalize(newSource, Normalizer.NFD);
|
||||
done = false;
|
||||
|
||||
// catch degenerate case
|
||||
|
@ -122,9 +123,10 @@ public class CanonicalIterator {
|
|||
// i should be the end of the first code point
|
||||
|
||||
int i = UTF16.findOffsetFromCodePoint(source, 1);
|
||||
|
||||
for (; i < source.length(); i += UTF16.getCharCount(i)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
if (SAFE_START.contains(cp)) {
|
||||
if (NormalizerImpl.isCanonSafeStart(cp)) {
|
||||
list.add(source.substring(start, i)); // add up to i
|
||||
start = i;
|
||||
}
|
||||
|
@ -195,21 +197,21 @@ public class CanonicalIterator {
|
|||
/**
|
||||
*@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition.
|
||||
*@internal
|
||||
*/
|
||||
*
|
||||
public static UnicodeSet getSafeStart() {
|
||||
return (UnicodeSet) SAFE_START.clone();
|
||||
}
|
||||
|
||||
*/
|
||||
/**
|
||||
*@return the set of characters whose decompositions start with the given character
|
||||
*@internal
|
||||
*/
|
||||
*
|
||||
public static UnicodeSet getStarts(int cp) {
|
||||
UnicodeSet result = AT_START.get(cp);
|
||||
if (result == null) result = EMPTY;
|
||||
return (UnicodeSet) result.clone();
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
// ===================== PRIVATES ==============================
|
||||
|
||||
|
@ -253,7 +255,7 @@ public class CanonicalIterator {
|
|||
String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0);
|
||||
if (attempt.equals(segment)) {
|
||||
*/
|
||||
if (Normalizer.isEquivalent(possible, segment, Normalizer.DECOMP, 0)) {
|
||||
if (Normalizer.compare(possible, segment,0)==0) {
|
||||
|
||||
if (PROGRESS) System.out.println("Adding Permutation: " + NAME.transliterate(possible));
|
||||
result.add(possible);
|
||||
|
@ -272,6 +274,54 @@ public class CanonicalIterator {
|
|||
|
||||
|
||||
private Set getEquivalents2(String segment) {
|
||||
|
||||
Set result = new HashSet();
|
||||
|
||||
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
|
||||
|
||||
result.add(segment);
|
||||
StringBuffer workingBuffer = new StringBuffer();
|
||||
|
||||
// cycle through all the characters
|
||||
int cp=0,end=0;
|
||||
int[] range = new int[2];
|
||||
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
|
||||
|
||||
// see if any character is at the start of some decomposition
|
||||
cp = UTF16.charAt(segment, i);;
|
||||
USerializedSet starts = new USerializedSet();
|
||||
|
||||
if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
|
||||
continue;
|
||||
}
|
||||
int j=0;
|
||||
// if so, see which decompositions match
|
||||
for(j = 0, cp = end+1; cp <= end ||starts.getSerializedRange(j++, range); ++cp) {
|
||||
if(cp>end){
|
||||
cp=range[0];
|
||||
end=range[1];
|
||||
}
|
||||
|
||||
Set remainder = extract(cp, segment, i,workingBuffer);
|
||||
if (remainder == null) continue;
|
||||
|
||||
// there were some matches, so add all the possibilities to the set.
|
||||
String prefix= segment.substring(0,i);
|
||||
prefix += UTF16.valueOf(cp);
|
||||
int el = -1;
|
||||
Iterator iter = remainder.iterator();
|
||||
while (iter.hasNext()) {
|
||||
String item = (String) iter.next();
|
||||
String toAdd = new String(prefix);
|
||||
toAdd += item;
|
||||
result.add(toAdd);
|
||||
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return result;
|
||||
/*
|
||||
Set result = new HashSet();
|
||||
if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment));
|
||||
result.add(segment);
|
||||
|
@ -283,6 +333,7 @@ public class CanonicalIterator {
|
|||
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
|
||||
// see if any character is at the start of some decomposition
|
||||
cp = UTF16.charAt(segment, i);
|
||||
NormalizerImpl.getCanonStartSet(c,fillSet)
|
||||
UnicodeSet starts = AT_START.get(cp);
|
||||
if (starts == null) continue;
|
||||
UnicodeSetIterator usi = new UnicodeSetIterator(starts);
|
||||
|
@ -305,6 +356,7 @@ public class CanonicalIterator {
|
|||
}
|
||||
}
|
||||
return result;
|
||||
*/
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -317,7 +369,7 @@ public class CanonicalIterator {
|
|||
+ ", " + NAME.transliterate(segment.substring(segmentPos)));
|
||||
|
||||
//String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
|
||||
String decomp = Normalizer.normalize(comp, Normalizer.DECOMP, 0);
|
||||
String decomp = Normalizer.normalize(comp, Normalizer.NFD);
|
||||
|
||||
// See if it matches the start of segment (at segmentPos)
|
||||
boolean ok = false;
|
||||
|
@ -369,7 +421,7 @@ public class CanonicalIterator {
|
|||
if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null;
|
||||
*/
|
||||
|
||||
if (!Normalizer.isEquivalent(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), Normalizer.DECOMP, 0)) return null;
|
||||
if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null;
|
||||
|
||||
// get the remaining combinations
|
||||
return getEquivalents2(remainder);
|
||||
|
@ -392,16 +444,18 @@ public class CanonicalIterator {
|
|||
SET_WITH_NULL_STRING.add("");
|
||||
}
|
||||
|
||||
private static UnicodeSet SAFE_START = new UnicodeSet();
|
||||
private static CharMap AT_START = new CharMap();
|
||||
// private static UnicodeSet SAFE_START = new UnicodeSet();
|
||||
// private static CharMap AT_START = new CharMap();
|
||||
|
||||
// TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!;
|
||||
// Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded.
|
||||
private static int LAST_UNICODE = 0x10FFFF;
|
||||
// private static int LAST_UNICODE = 0x10FFFF;
|
||||
/*
|
||||
static {
|
||||
buildData();
|
||||
}
|
||||
|
||||
*/
|
||||
/*
|
||||
private static void buildData() {
|
||||
|
||||
if (PROGRESS) System.out.println("Getting Safe Start");
|
||||
|
@ -417,10 +471,10 @@ public class CanonicalIterator {
|
|||
for (int cp = 0; cp <= LAST_UNICODE; ++cp) {
|
||||
if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.');
|
||||
|
||||
if (Normalizer.isNormalized(cp, Normalizer.DECOMP, 0)) continue;
|
||||
if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue;
|
||||
|
||||
//String istr = UTF16.valueOf(cp);
|
||||
String decomp = Normalizer.normalize(cp, Normalizer.DECOMP, 0);
|
||||
String decomp = Normalizer.normalize(cp, Normalizer.NFD);
|
||||
//if (decomp.equals(istr)) continue;
|
||||
|
||||
// add each character in the decomposition to canBeIn
|
||||
|
@ -437,7 +491,7 @@ public class CanonicalIterator {
|
|||
}
|
||||
if (PROGRESS) System.out.println();
|
||||
}
|
||||
|
||||
*/
|
||||
// the following is just for a map from characters to a set of characters
|
||||
|
||||
private static class CharMap {
|
||||
|
|
|
@ -5,12 +5,14 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/ComposedCharIter.java,v $
|
||||
* $Date: 2002/02/16 03:06:05 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
|
||||
/**
|
||||
* <tt>ComposedCharIter</tt> is an iterator class that returns all
|
||||
|
@ -51,6 +53,7 @@ package com.ibm.icu.text;
|
|||
* <tt>ComposedCharIter</tt> is currently based on version 2.1.8 of the
|
||||
* <a href="http://www.unicode.org" target="unicode">Unicode Standard</a>.
|
||||
* It will be updated as later versions of Unicode are released.
|
||||
* @deprecated
|
||||
*/
|
||||
public final class ComposedCharIter {
|
||||
|
||||
|
@ -59,7 +62,7 @@ public final class ComposedCharIter {
|
|||
* {@link #next} returns this value when there are no more composed characters
|
||||
* over which to iterate.
|
||||
*/
|
||||
public static final char DONE = Normalizer.DONE;
|
||||
public static final char DONE = (char) Normalizer.DONE;
|
||||
|
||||
/**
|
||||
* Construct a new <tt>ComposedCharIter</tt>. The iterator will return
|
||||
|
@ -67,8 +70,8 @@ public final class ComposedCharIter {
|
|||
* Hangul characters.
|
||||
*/
|
||||
public ComposedCharIter() {
|
||||
minDecomp = DecompData.MAX_COMPAT;
|
||||
hangul = false;
|
||||
compat = false;
|
||||
options =0;
|
||||
}
|
||||
|
||||
|
||||
|
@ -86,10 +89,8 @@ public final class ComposedCharIter {
|
|||
* Jamo decompositions.
|
||||
*/
|
||||
public ComposedCharIter(boolean compat, int options) {
|
||||
// Compatibility explosions have lower indices; skip them if necessary
|
||||
minDecomp = compat ? 0 : DecompData.MAX_COMPAT;
|
||||
|
||||
hangul = (options & Normalizer.IGNORE_HANGUL) == 0;
|
||||
this.compat = compat;
|
||||
this.options = options;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -97,10 +98,10 @@ public final class ComposedCharIter {
|
|||
* by {@link #next}.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
if (nextChar == DONE) {
|
||||
if (nextChar == Normalizer.DONE) {
|
||||
findNextChar();
|
||||
}
|
||||
return nextChar != DONE;
|
||||
return nextChar != Normalizer.DONE;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -111,12 +112,12 @@ public final class ComposedCharIter {
|
|||
* to <tt>next</tt> will return {@link #DONE}.
|
||||
*/
|
||||
public char next() {
|
||||
if (nextChar == DONE) {
|
||||
if (nextChar == Normalizer.DONE) {
|
||||
findNextChar();
|
||||
}
|
||||
curChar = nextChar;
|
||||
nextChar = DONE;
|
||||
return curChar;
|
||||
nextChar = Normalizer.DONE;
|
||||
return (char) curChar;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -126,42 +127,38 @@ public final class ComposedCharIter {
|
|||
* affected by the settings of the options passed to the constructor.
|
||||
*/
|
||||
public String decomposition() {
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
||||
int pos = (char)(DecompData.offsets.elementAt(curChar) & DecompData.DECOMP_MASK);
|
||||
|
||||
if (pos > minDecomp) {
|
||||
Normalizer.doAppend(DecompData.contents, pos, result);
|
||||
|
||||
|
||||
} else if (hangul && curChar >= HANGUL_BASE && curChar < HANGUL_LIMIT) {
|
||||
Normalizer.hangulToJamo(curChar, result, minDecomp);
|
||||
} else {
|
||||
result.append(curChar);
|
||||
}
|
||||
return result.toString();
|
||||
// the decomposition buffer contains the decomposition of
|
||||
// current char so just return it
|
||||
return new String(decompBuf,0, bufLen);
|
||||
}
|
||||
|
||||
private void findNextChar() {
|
||||
if (curChar != DONE) {
|
||||
char ch = curChar;
|
||||
while (++ch < 0xFFFF) {
|
||||
int offset = DecompData.offsets.elementAt(ch) & DecompData.DECOMP_MASK;
|
||||
if (offset > minDecomp
|
||||
|| (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) ) {
|
||||
nextChar = ch;
|
||||
int c=curChar+1;
|
||||
for(;;){
|
||||
if(c < 0xFFFF){
|
||||
bufLen = NormalizerImpl.getDecomposition(c,compat,
|
||||
decompBuf,0,
|
||||
decompBuf.length);
|
||||
if(bufLen>0){
|
||||
// the curChar can be decomposed... so it is a composed char
|
||||
// cache the result
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
c++;
|
||||
}else{
|
||||
c=Normalizer.DONE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
nextChar=c;
|
||||
}
|
||||
|
||||
private final int minDecomp;
|
||||
private final boolean hangul;
|
||||
private int options;
|
||||
private boolean compat;
|
||||
private char[] decompBuf = new char[100];
|
||||
private int bufLen=0;
|
||||
private int curChar = 0;
|
||||
private int nextChar = Normalizer.DONE;
|
||||
|
||||
private char curChar = 0;
|
||||
private char nextChar = Normalizer.DONE;
|
||||
|
||||
private static final char HANGUL_BASE = Normalizer.HANGUL_BASE;
|
||||
private static final char HANGUL_LIMIT = Normalizer.HANGUL_LIMIT;
|
||||
|
||||
};
|
||||
|
|
|
@ -5,15 +5,15 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/LowercaseTransliterator.java,v $
|
||||
* $Date: 2002/04/03 00:00:00 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
import java.util.*;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.UCharacterIterator;
|
||||
import com.ibm.icu.impl.UnicodeCharacterIterator;
|
||||
|
||||
/**
|
||||
* A transliterator that performs locale-sensitive toLower()
|
||||
|
@ -63,7 +63,7 @@ class LowercaseTransliterator extends Transliterator{
|
|||
// get string for context
|
||||
// TODO: add convenience method to do this, since we do it all over
|
||||
|
||||
UCharacterIterator original = new UCharacterIterator(text);
|
||||
UnicodeCharacterIterator original = new UnicodeCharacterIterator(text);
|
||||
|
||||
// Walk through original string
|
||||
// If there is a case change, modify corresponding position in replaceable
|
||||
|
|
|
@ -14,7 +14,7 @@ import com.ibm.icu.lang.*;
|
|||
|
||||
/**
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: NormalizationTransliterator.java,v $ $Revision: 1.17 $ $Date: 2002/02/25 22:43:58 $
|
||||
* @version $RCSfile: NormalizationTransliterator.java,v $ $Revision: 1.18 $ $Date: 2002/06/20 01:21:18 $
|
||||
*/
|
||||
final class NormalizationTransliterator extends Transliterator {
|
||||
|
||||
|
@ -57,25 +57,25 @@ final class NormalizationTransliterator extends Transliterator {
|
|||
Transliterator.registerFactory("Any-NFC", new Transliterator.Factory() {
|
||||
public Transliterator getInstance(String ID) {
|
||||
return NormalizationTransliterator.
|
||||
getInstance(Normalizer.COMPOSE);
|
||||
getInstance(Normalizer.NFC);
|
||||
}
|
||||
});
|
||||
Transliterator.registerFactory("Any-NFD", new Transliterator.Factory() {
|
||||
public Transliterator getInstance(String ID) {
|
||||
return NormalizationTransliterator.
|
||||
getInstance(Normalizer.DECOMP);
|
||||
getInstance(Normalizer.NFD);
|
||||
}
|
||||
});
|
||||
Transliterator.registerFactory("Any-NFKC", new Transliterator.Factory() {
|
||||
public Transliterator getInstance(String ID) {
|
||||
return NormalizationTransliterator.
|
||||
getInstance(Normalizer.COMPOSE_COMPAT);
|
||||
getInstance(Normalizer.NFKC);
|
||||
}
|
||||
});
|
||||
Transliterator.registerFactory("Any-NFKD", new Transliterator.Factory() {
|
||||
public Transliterator getInstance(String ID) {
|
||||
return NormalizationTransliterator.
|
||||
getInstance(Normalizer.DECOMP_COMPAT);
|
||||
getInstance(Normalizer.NFKD);
|
||||
}
|
||||
});
|
||||
Transliterator.registerSpecialInverse("NFC", "NFD", true);
|
||||
|
@ -89,7 +89,21 @@ final class NormalizationTransliterator extends Transliterator {
|
|||
int opt) {
|
||||
StringBuffer id = new StringBuffer("NF");
|
||||
int choice = 0;
|
||||
if (m.compat()) {
|
||||
if(m==Normalizer.NFC){
|
||||
id.append("C");
|
||||
choice |= C;
|
||||
}else if(m==Normalizer.NFKC){
|
||||
id.append("KC");
|
||||
choice |= KC;
|
||||
}else if(m==Normalizer.NFD){
|
||||
id.append("D");
|
||||
choice |= D;
|
||||
}else if(m==Normalizer.NFKD){
|
||||
id.append("KD");
|
||||
choice |= KD;
|
||||
}
|
||||
|
||||
/*if (m.compat()) {
|
||||
id.append('K');
|
||||
choice |= KD;
|
||||
}
|
||||
|
@ -98,7 +112,7 @@ final class NormalizationTransliterator extends Transliterator {
|
|||
choice |= C;
|
||||
} else {
|
||||
id.append('D');
|
||||
}
|
||||
}*/
|
||||
return new NormalizationTransliterator(id.toString(), m, choice, opt);
|
||||
}
|
||||
|
||||
|
@ -185,7 +199,7 @@ final class NormalizationTransliterator extends Transliterator {
|
|||
}
|
||||
text.getChars(lastSafe, limit, buffer, 0);
|
||||
String input = new String(buffer, 0, len); // TODO: fix normalizer to take char[]
|
||||
String output = Normalizer.normalize(input, mode, options);
|
||||
String output = Normalizer.normalize(input, mode);
|
||||
|
||||
// verify OK, if specified
|
||||
if (verify != null) {
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -3,13 +3,13 @@
|
|||
* others. All Rights Reserved.
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TitlecaseTransliterator.java,v $
|
||||
* $Date: 2002/04/02 23:59:59 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.16 $
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
import java.util.*;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.UCharacterIterator;
|
||||
import com.ibm.icu.impl.UnicodeCharacterIterator;
|
||||
|
||||
/**
|
||||
* A transliterator that converts all letters (as defined by
|
||||
|
@ -92,7 +92,7 @@ class TitlecaseTransliterator extends Transliterator {
|
|||
// get string for context
|
||||
// TODO: add convenience method to do this, since we do it all over
|
||||
|
||||
UCharacterIterator original = new UCharacterIterator(text);
|
||||
UnicodeCharacterIterator original = new UnicodeCharacterIterator(text);
|
||||
|
||||
// Walk through original string
|
||||
// If there is a case change, modify corresponding position in replaceable
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
|
||||
* $Date: 2002/04/17 16:46:11 $
|
||||
* $Revision: 1.21 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.22 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
@ -1334,13 +1334,13 @@ class TransliteratorParser {
|
|||
|
||||
p = Utility.parsePattern(rule, pos, limit, "~nfd rules~;", null);
|
||||
if (p >= 0) {
|
||||
pragmaNormalizeRules(Normalizer.DECOMP);
|
||||
pragmaNormalizeRules(Normalizer.NFD);
|
||||
return p;
|
||||
}
|
||||
|
||||
p = Utility.parsePattern(rule, pos, limit, "~nfc rules~;", null);
|
||||
if (p >= 0) {
|
||||
pragmaNormalizeRules(Normalizer.COMPOSE);
|
||||
pragmaNormalizeRules(Normalizer.NFC);
|
||||
return p;
|
||||
}
|
||||
|
||||
|
|
|
@ -32,17 +32,17 @@ public class TransliteratorUtility {
|
|||
// transliterators.
|
||||
for (Enumeration e = Transliterator.getAvailableIDs(); e.hasMoreElements(); ) {
|
||||
String ID = (String) e.nextElement();
|
||||
showSourceSet(ID, Normalizer.NO_OP, false);
|
||||
showSourceSet(ID, Normalizer.NONE, false);
|
||||
}
|
||||
} else {
|
||||
// Usage: ID [NFKD | NFD] [lower]
|
||||
Normalizer.Mode m = Normalizer.NO_OP;
|
||||
Normalizer.Mode m = Normalizer.NONE;
|
||||
boolean lowerFirst = false;
|
||||
if (args.length >= 2) {
|
||||
if (args[1].equalsIgnoreCase("NFD")) {
|
||||
m = Normalizer.DECOMP;
|
||||
m = Normalizer.NFD;
|
||||
} else if (args[1].equalsIgnoreCase("NFKD")) {
|
||||
m = Normalizer.DECOMP_COMPAT;
|
||||
m = Normalizer.NFKD;
|
||||
} else {
|
||||
usage();
|
||||
}
|
||||
|
@ -87,7 +87,7 @@ public class TransliteratorUtility {
|
|||
|
||||
static void showSourceSetAux(Transliterator t, Normalizer.Mode m, boolean lowerFirst, boolean forward) throws IOException {
|
||||
UnicodeSet sourceSet = t.getSourceSet();
|
||||
if (m != Normalizer.NO_OP || lowerFirst) {
|
||||
if (m != Normalizer.NONE || lowerFirst) {
|
||||
UnicodeSetClosure.close(sourceSet, m, lowerFirst);
|
||||
}
|
||||
System.out.println(t.getID() + ": " +
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UTF16.java,v $
|
||||
* $Date: 2002/05/14 23:45:46 $
|
||||
* $Revision: 1.20 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.21 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,6 +14,7 @@
|
|||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
/**
|
||||
* Standalone utility class providing UTF16 character conversions and indexing
|
||||
* conversions.
|
||||
|
@ -2213,6 +2214,35 @@ public final class UTF16
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
public int caseCompare(Object a, Object b, int options){
|
||||
if (a == b) {
|
||||
return 0;
|
||||
}
|
||||
if (a == null) {
|
||||
return -1;
|
||||
}
|
||||
if (b == null) {
|
||||
return 1;
|
||||
}
|
||||
String sa = (String) a;
|
||||
String sb = (String) b;
|
||||
int la = sa.length();
|
||||
int lb = sb.length();
|
||||
if( sa != sb ){
|
||||
int result = NormalizerImpl.cmpEquivFold(sa,sb,
|
||||
options|Normalizer.COMPARE_IGNORE_CASE);
|
||||
if(result!=0) {
|
||||
return (int)((byte)(result >> 24 | 1));
|
||||
}
|
||||
|
||||
}else{
|
||||
if(la != lb){
|
||||
return (int)((byte)((la-lb) >> 24 | 1));
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// private data members -------------------------------------------------
|
||||
|
|
|
@ -5,15 +5,15 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UppercaseTransliterator.java,v $
|
||||
* $Date: 2002/04/02 23:59:59 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/06/20 01:21:18 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
import java.util.*;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
import com.ibm.icu.impl.UCharacterIterator;
|
||||
import com.ibm.icu.impl.UnicodeCharacterIterator;
|
||||
|
||||
/**
|
||||
* A transliterator that performs locale-sensitive toUpper()
|
||||
|
@ -59,7 +59,7 @@ class UppercaseTransliterator extends Transliterator {
|
|||
// get string for context
|
||||
// TODO: add convenience method to do this, since we do it all over
|
||||
|
||||
UCharacterIterator original = new UCharacterIterator(text);
|
||||
UnicodeCharacterIterator original = new UnicodeCharacterIterator(text);
|
||||
|
||||
// Walk through original string
|
||||
// If there is a case change, modify corresponding position in replaceable
|
||||
|
|
Loading…
Add table
Reference in a new issue