mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-4465 fixed canonical iterator
X-SVN-Rev: 17469
This commit is contained in:
parent
ac3cc9119b
commit
49305fcad3
2 changed files with 38 additions and 36 deletions
|
@ -117,29 +117,30 @@ public final class CanonicalIterator {
|
|||
}
|
||||
|
||||
// find the segments
|
||||
List list = new ArrayList();
|
||||
List segmentList = new ArrayList();
|
||||
int cp;
|
||||
int start = 0;
|
||||
|
||||
// i should be the end of the first code point
|
||||
// break up the string into segments
|
||||
|
||||
int i = UTF16.findOffsetFromCodePoint(source, 1);
|
||||
|
||||
for (; i < source.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(source, i);
|
||||
if (NormalizerImpl.isCanonSafeStart(cp)) {
|
||||
list.add(source.substring(start, i)); // add up to i
|
||||
segmentList.add(source.substring(start, i)); // add up to i
|
||||
start = i;
|
||||
}
|
||||
}
|
||||
list.add(source.substring(start, i)); // add last one
|
||||
segmentList.add(source.substring(start, i)); // add last one
|
||||
|
||||
// allocate the arrays, and find the strings that are CE to each segment
|
||||
pieces = new String[list.size()][];
|
||||
current = new int[list.size()];
|
||||
pieces = new String[segmentList.size()][];
|
||||
current = new int[segmentList.size()];
|
||||
for (i = 0; i < pieces.length; ++i) {
|
||||
if (PROGRESS) System.out.println("SEGMENT");
|
||||
pieces[i] = getEquivalents((String) list.get(i));
|
||||
pieces[i] = getEquivalents((String) segmentList.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -285,12 +286,12 @@ public final class CanonicalIterator {
|
|||
StringBuffer workingBuffer = new StringBuffer();
|
||||
|
||||
// cycle through all the characters
|
||||
int cp=0,end=0;
|
||||
int cp=0;
|
||||
int[] range = new int[2];
|
||||
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
|
||||
|
||||
// see if any character is at the start of some decomposition
|
||||
cp = UTF16.charAt(segment, i);;
|
||||
cp = UTF16.charAt(segment, i);
|
||||
USerializedSet starts = new USerializedSet();
|
||||
|
||||
if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
|
||||
|
@ -298,28 +299,27 @@ public final class CanonicalIterator {
|
|||
}
|
||||
int j=0;
|
||||
// if so, see which decompositions match
|
||||
for(j = 0, cp = end+1; cp <= end ||starts.getRange(j++, range); ++cp) {
|
||||
if(cp>end){
|
||||
cp=range[0];
|
||||
end=range[1];
|
||||
int rangeCount = starts.countRanges();
|
||||
for(j = 0; j < rangeCount; ++j) {
|
||||
starts.getRange(j, range);
|
||||
int end=range[1];
|
||||
for (int cp2 = range[0]; cp2 <= end; ++cp2) {
|
||||
Set remainder = extract(cp2, segment, i, workingBuffer);
|
||||
if (remainder == null) continue;
|
||||
|
||||
// there were some matches, so add all the possibilities to the set.
|
||||
String prefix= segment.substring(0,i);
|
||||
prefix += UTF16.valueOf(cp2);
|
||||
//int el = -1;
|
||||
Iterator iter = remainder.iterator();
|
||||
while (iter.hasNext()) {
|
||||
String item = (String) iter.next();
|
||||
String toAdd = new String(prefix);
|
||||
toAdd += item;
|
||||
result.add(toAdd);
|
||||
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
|
||||
}
|
||||
}
|
||||
|
||||
Set remainder = extract(cp, segment, i,workingBuffer);
|
||||
if (remainder == null) continue;
|
||||
|
||||
// there were some matches, so add all the possibilities to the set.
|
||||
String prefix= segment.substring(0,i);
|
||||
prefix += UTF16.valueOf(cp);
|
||||
//int el = -1;
|
||||
Iterator iter = remainder.iterator();
|
||||
while (iter.hasNext()) {
|
||||
String item = (String) iter.next();
|
||||
String toAdd = new String(prefix);
|
||||
toAdd += item;
|
||||
result.add(toAdd);
|
||||
//if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
|
||||
* $Date: 2005/04/06 08:48:16 $
|
||||
* $Revision: 1.19 $
|
||||
* $Date: 2005/04/06 15:15:43 $
|
||||
* $Revision: 1.20 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -33,7 +33,6 @@ public class Main {
|
|||
};
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
//checkCanonicalIterator();
|
||||
// NOTE: so far, we don't need to build the UCA with anything but the latest versions.
|
||||
// A few changes would need to be made to the code to do older versions.
|
||||
try {
|
||||
|
@ -101,6 +100,9 @@ public class Main {
|
|||
else if (arg.equalsIgnoreCase("short")) shortPrint = !shortPrint;
|
||||
else if (arg.equalsIgnoreCase("noCE")) noCE = !noCE;
|
||||
|
||||
else if (arg.equalsIgnoreCase("checkCanonicalIterator")) checkCanonicalIterator();
|
||||
|
||||
|
||||
else if (arg.equalsIgnoreCase("writeAllocation")) WriteCharts.writeAllocation();
|
||||
// else if (arg.equalsIgnoreCase("probe")) Probe.test();
|
||||
|
||||
|
@ -148,16 +150,16 @@ public class Main {
|
|||
System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL)));
|
||||
|
||||
CanonicalIterator it = new CanonicalIterator("");
|
||||
String[] tests = new String[] {"\uF900"};
|
||||
String[] tests = new String[] {"\uF900", "\u00C5d\u0307\u0327"};
|
||||
for (int j = 0; j < tests.length; ++j) {
|
||||
System.out.println(tests[j]);
|
||||
System.out.println(Default.ucd().getCodeAndName(tests[j]));
|
||||
it.setSource(tests[j]);
|
||||
String ss;
|
||||
for (int i = 0; (ss = it.next()) != null; ++i) {
|
||||
System.out.println(i + "\t" + Utility.hex(ss));
|
||||
System.out.println(i + "\t" + Default.ucd().getCodeAndName(ss));
|
||||
}
|
||||
}
|
||||
if (true) throw new IllegalArgumentException();
|
||||
// verify that nothing breaks
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
int cat = UCharacter.getType(i);
|
||||
if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;
|
||||
|
|
Loading…
Add table
Reference in a new issue