ICU-4465 fixed canonical iterator

X-SVN-Rev: 17469
This commit is contained in:
Mark Davis 2005-04-06 15:15:43 +00:00
parent ac3cc9119b
commit 49305fcad3
2 changed files with 38 additions and 36 deletions

View file

@ -117,29 +117,30 @@ public final class CanonicalIterator {
}
// find the segments
List list = new ArrayList();
List segmentList = new ArrayList();
int cp;
int start = 0;
// i should be the end of the first code point
// break up the string into segments
int i = UTF16.findOffsetFromCodePoint(source, 1);
for (; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
if (NormalizerImpl.isCanonSafeStart(cp)) {
list.add(source.substring(start, i)); // add up to i
segmentList.add(source.substring(start, i)); // add up to i
start = i;
}
}
list.add(source.substring(start, i)); // add last one
segmentList.add(source.substring(start, i)); // add last one
// allocate the arrays, and find the strings that are canonically equivalent (CE) to each segment
pieces = new String[list.size()][];
current = new int[list.size()];
pieces = new String[segmentList.size()][];
current = new int[segmentList.size()];
for (i = 0; i < pieces.length; ++i) {
if (PROGRESS) System.out.println("SEGMENT");
pieces[i] = getEquivalents((String) list.get(i));
pieces[i] = getEquivalents((String) segmentList.get(i));
}
}
@ -285,12 +286,12 @@ public final class CanonicalIterator {
StringBuffer workingBuffer = new StringBuffer();
// cycle through all the characters
int cp=0,end=0;
int cp=0;
int[] range = new int[2];
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
// see if any character is at the start of some decomposition
cp = UTF16.charAt(segment, i);;
cp = UTF16.charAt(segment, i);
USerializedSet starts = new USerializedSet();
if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
@ -298,28 +299,27 @@ public final class CanonicalIterator {
}
int j=0;
// if so, see which decompositions match
for(j = 0, cp = end+1; cp <= end ||starts.getRange(j++, range); ++cp) {
if(cp>end){
cp=range[0];
end=range[1];
int rangeCount = starts.countRanges();
for(j = 0; j < rangeCount; ++j) {
starts.getRange(j, range);
int end=range[1];
for (int cp2 = range[0]; cp2 <= end; ++cp2) {
Set remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) continue;
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp2);
//int el = -1;
Iterator iter = remainder.iterator();
while (iter.hasNext()) {
String item = (String) iter.next();
String toAdd = new String(prefix);
toAdd += item;
result.add(toAdd);
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
}
}
Set remainder = extract(cp, segment, i,workingBuffer);
if (remainder == null) continue;
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp);
//int el = -1;
Iterator iter = remainder.iterator();
while (iter.hasNext()) {
String item = (String) iter.next();
String toAdd = new String(prefix);
toAdd += item;
result.add(toAdd);
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
}
}
}
return result;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
* $Date: 2005/04/06 08:48:16 $
* $Revision: 1.19 $
* $Date: 2005/04/06 15:15:43 $
* $Revision: 1.20 $
*
*******************************************************************************
*/
@ -33,7 +33,6 @@ public class Main {
};
public static void main(String args[]) throws Exception {
//checkCanonicalIterator();
// NOTE: so far, we don't need to build the UCA with anything but the latest versions.
// A few changes would need to be made to the code to do older versions.
try {
@ -101,6 +100,9 @@ public class Main {
else if (arg.equalsIgnoreCase("short")) shortPrint = !shortPrint;
else if (arg.equalsIgnoreCase("noCE")) noCE = !noCE;
else if (arg.equalsIgnoreCase("checkCanonicalIterator")) checkCanonicalIterator();
else if (arg.equalsIgnoreCase("writeAllocation")) WriteCharts.writeAllocation();
// else if (arg.equalsIgnoreCase("probe")) Probe.test();
@ -148,16 +150,16 @@ public class Main {
System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL)));
CanonicalIterator it = new CanonicalIterator("");
String[] tests = new String[] {"\uF900"};
String[] tests = new String[] {"\uF900", "\u00C5d\u0307\u0327"};
for (int j = 0; j < tests.length; ++j) {
System.out.println(tests[j]);
System.out.println(Default.ucd().getCodeAndName(tests[j]));
it.setSource(tests[j]);
String ss;
for (int i = 0; (ss = it.next()) != null; ++i) {
System.out.println(i + "\t" + Utility.hex(ss));
System.out.println(i + "\t" + Default.ucd().getCodeAndName(ss));
}
}
if (true) throw new IllegalArgumentException();
// verify that nothing breaks
for (int i = 0; i < 0x10FFFF; ++i) {
int cat = UCharacter.getType(i);
if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;