From ed21be41c5547719638c89702e5e7ed88356f0cd Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Thu, 12 Feb 2004 06:56:28 +0000 Subject: [PATCH] performance fixes for UnicodeMap X-SVN-Rev: 14492 --- .../com/ibm/icu/dev/test/util/UnicodeMap.java | 345 ++++++++++++++---- 1 file changed, 283 insertions(+), 62 deletions(-) diff --git a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeMap.java b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeMap.java index bf0aca081bf..4e7dc684061 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/UnicodeMap.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/UnicodeMap.java @@ -5,17 +5,174 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import com.ibm.icu.impl.Utility; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; /** * Class for mapping Unicode characters to values * Much smaller storage than using HashMap. * @author Davis */ // TODO Optimize using range map -public class UnicodeMap { - // TODO optimize - private HashMap objectToSet = new HashMap(); - private UnicodeSet missing = new UnicodeSet(0,0x10FFFF); +public final class UnicodeMap { + static final boolean ASSERTIONS = false; + static final long GROWTH_PERCENT = 200; // 100 is no growth! + static final long GROWTH_GAP = 10; // extra bump! + + private int length = 2; + private int[] transitions = {0,0x110000,0,0,0,0,0,0,0,0}; + private Object[] values = new Object[10]; + { + values[1] = "TERMINAL"; + } + + void _checkInvariants() { + if (length < 2 + || length > transitions.length + || transitions.length != values.length) { + throw new IllegalArgumentException("Invariant failed: Lengths bad"); + } + for (int i = 1; i < length-1; ++i) { + if (equator.isEqual(values[i-1], values[i])) { + throw new IllegalArgumentException("Invariant failed: values shared at " + + "\t" + Utility.hex(i-1) + ": <" + values[i-1] + ">" + + "\t" + Utility.hex(i) + ": <" + values[i] + ">" + ); + } + } + if (transitions[0] != 0 || transitions[length-1] != 0x110000) { + throw new IllegalArgumentException("Invariant failed: bounds set wrong"); + } + for (int i = 1; i < length-1; ++i) { + if (transitions[i-1] >= transitions[i]) { + throw new IllegalArgumentException("Invariant failed: not monotonic" + + "\t" + Utility.hex(i-1) + ": " + transitions[i-1] + + "\t" + Utility.hex(i) + ": " + transitions[i] + ); + } + } + } + + public interface Equator { + /** + * Comparator function. If overridden, must handle case of null, + * and compare any two objects in the array + * @param a + * @param b + * @return + */ + public boolean isEqual(Object a, Object b); + } + + public static class SimpleEquator implements Equator { + public boolean isEqual(Object a, Object b) { + if (a == b) return true; + if (a == null || b == null) return false; + return a.equals(b); + } + } + private Equator equator = new SimpleEquator(); + + /** + * Finds an index such that inversionList[i] <= codepoint < inversionList[i+1] + * Assumes that 0 <= codepoint <= 0x10FFFF + * @param codepoint + * @return + */ + private int findIndex(int c) { + int lo = 0; + int hi = length - 1; + int i = (lo + hi) >>> 1; + // invariant: c >= list[lo] + // invariant: c < list[hi] + while (i != lo) { + if (c < transitions[i]) { + hi = i; + } else { + lo = i; + } + i = (lo + hi) >>> 1; + } + if (ASSERTIONS) _checkFind(c, lo); + return lo; + } + + private void _checkFind(int codepoint, int value) { + int other = _findIndex(codepoint); + if (other != value) { + throw new IllegalArgumentException("Invariant failed: binary search" + + "\t" + Utility.hex(codepoint) + ": " + value + + "\tshould be: " + other); + } + } + + private int _findIndex(int codepoint) { + // TODO use binary search + for (int i = length-1; i > 0; --i) { + if (transitions[i] <= codepoint) return i; + } + return 0; + } + + /* + * Try indexed lookup + + static final int SHIFT = 8; + int[] starts = new int[0x10FFFF>>SHIFT]; // lowest transition index where codepoint>>x can be found + boolean startsValid = false; + private int findIndex(int codepoint) { + if (!startsValid) { + int start = 0; + for (int i = 1; i < length; ++i) { + + } + } + for (int i = length-1; i > 0; --i) { + if (transitions[i] <= codepoint) return i; + } + return 0; + } + */ + + /** + * Remove the items from index through index+count-1. + * Logically reduces the size of the internal arrays. + * @param index + * @param count + */ + private void removeAt(int index, int count) { + for (int i = index + count; i < length; ++i) { + transitions[i-count] = transitions[i]; + values[i-count] = values[i]; + } + length -= count; + } + /** + * Add a gap from index to index+count-1. + * The values there are undefined, and must be set. + * Logically grows arrays to accomodate. Actual growth is limited + * @param index + * @param count + */ + private void insertGapAt(int index, int count) { + int newLength = length + count; + int[] oldtransitions = transitions; + Object[] oldvalues = values; + if (newLength > transitions.length) { + int allocation = (int) (GROWTH_GAP + (newLength * GROWTH_PERCENT) / 100); + transitions = new int[allocation]; + values = new Object[allocation]; + for (int i = 0; i < index; ++i) { + transitions[i] = oldtransitions[i]; + values[i] = oldvalues[i]; + } + } + for (int i = length - 1; i >= index; --i) { + transitions[i+count] = oldtransitions[i]; + values[i+count] = oldvalues[i]; + } + length = newLength; + } /** * Associates code point with value. Removes any previous association. @@ -23,49 +180,102 @@ public class UnicodeMap { * @param value * @return this, for chaining */ - public UnicodeMap put(int codepoint, Object value) { - if (!missing.contains(codepoint)) { - // remove from wherever it is. - Iterator it = objectToSet.keySet().iterator(); - while (it.hasNext()) { - UnicodeSet set = (UnicodeSet) objectToSet.get(it.next()); - if (set.contains(codepoint)) { - set.remove(codepoint); - break; + private UnicodeMap _put(int codepoint, Object value) { + int baseIndex = findIndex(codepoint); + int limitIndex = baseIndex + 1; + // cases are (a) value is already set + if (equator.isEqual(values[baseIndex], value)) return this; + int baseCP = transitions[baseIndex]; + int limitCP = transitions[limitIndex]; + // CASE: At very start of range + if (baseCP == codepoint) { + boolean connectsWithPrevious = + baseIndex != 0 && equator.isEqual(value, values[baseIndex-1]); + + // CASE: Single codepoint range + if (limitCP == codepoint + 1) { + boolean connectsWithFollowing = + baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]); + // A1a connects with previous & following, so remove index + if (connectsWithPrevious) { + if (connectsWithFollowing) { + removeAt(baseIndex, 2); + return this; + } + removeAt(baseIndex, 1); // extend previous + return this; + } else if (connectsWithFollowing) { + removeAt(baseIndex, 1); // extend following backwards + transitions[baseIndex] = codepoint; + return this; } + // doesn't connect on either side, just reset + values[baseIndex] = value; + return this; + } + // A.1: start of multi codepoint range + // if connects + if (connectsWithPrevious) { + ++transitions[baseIndex]; // extend previous + } else { + // otherwise insert new transition + transitions[baseIndex] = codepoint+1; // fix following range + insertGapAt(baseIndex, 1); + values[baseIndex] = value; + transitions[baseIndex] = codepoint; } - missing.remove(codepoint); + return this; } - UnicodeSet set = (UnicodeSet) objectToSet.get(value); - if (set == null) { - set = new UnicodeSet(); - objectToSet.put(value,set); + // CASE: at end of range + if (limitCP == codepoint + 1) { + // if connects, just back up range + boolean connectsWithFollowing = + baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]); + + if (connectsWithFollowing) { + --transitions[limitIndex]; + return this; + } else { + insertGapAt(limitIndex, 1); + transitions[limitIndex] = codepoint; + values[limitIndex] = value; + } + return this; } - set.add(codepoint); + // CASE: in middle of range + insertGapAt(++baseIndex,2); + transitions[baseIndex] = codepoint; + values[baseIndex] = value; + transitions[++baseIndex] = codepoint + 1; + values[baseIndex] = values[baseIndex-2]; // copy lower range values return this; } /** - * Adds bunch o' codepoints; otherwise like add. + * Sets the codepoint value. + * @param codepoint + * @param value + * @return + */ + public UnicodeMap put(int codepoint, Object value) { + if (codepoint < 0 || codepoint > 0x10FFFF) { + throw new IllegalArgumentException("Codepoint out of range: " + codepoint); + } + _put(codepoint, value); + if (ASSERTIONS) _checkInvariants(); + return this; + } + /** + * Adds bunch o' codepoints; otherwise like put. * @param codepoints * @param value * @return this, for chaining */ public UnicodeMap putAll(UnicodeSet codepoints, Object value) { - if (!missing.containsAll(codepoints)) { - // remove from wherever it is. - Iterator it = objectToSet.keySet().iterator(); - while (it.hasNext()) { - UnicodeSet set = (UnicodeSet) objectToSet.get(it.next()); - set.removeAll(codepoints); - } + // TODO optimize + UnicodeSetIterator it = new UnicodeSetIterator(codepoints); + while (it.next()) { + _put(it.codepoint, value); } - missing.removeAll(codepoints); - UnicodeSet set = (UnicodeSet) objectToSet.get(value); - if (set == null) { - set = new UnicodeSet(); - objectToSet.put(value,set); - } - set.addAll(codepoints); return this; } @@ -76,8 +286,15 @@ public class UnicodeMap { * @return this, for chaining */ public UnicodeMap putAll(int startCodePoint, int endCodePoint, Object value) { + if (startCodePoint < 0 || endCodePoint > 0x10FFFF) { + throw new IllegalArgumentException("Codepoint out of range: " + + Utility.hex(startCodePoint) + ".." + Utility.hex(endCodePoint)); + } // TODO optimize - return putAll(new UnicodeSet(startCodePoint, endCodePoint), value); + for (int i = startCodePoint; i <= endCodePoint; ++i) { + _put(i, value); + } + return this; } /** * Add all the (main) values from a Unicode property @@ -85,23 +302,22 @@ public class UnicodeMap { * @return */ public UnicodeMap putAll(UnicodeProperty prop) { - UnicodeSet temp = new UnicodeSet(); - Iterator it = prop.getAliases().iterator(); - while(it.hasNext()) { - String value = (String) it.next(); - temp.clear(); - putAll(prop.getSet(value,temp), value); + // TODO optimize + for (int i = 0; i <= 0x10FFFF; ++i) { + _put(i, prop.getValue(i)); } - return null; + return this; } /** * Set the currently unmapped Unicode code points to the given value. * @param value * @return - */public UnicodeMap setMissing(Object value) { - objectToSet.put(value,missing); - missing = new UnicodeSet(); + */ + public UnicodeMap setMissing(Object value) { + for (int i = 0; i < length; ++i) { + if (values[i] == null) values[i] = value; + } return this; } /** @@ -114,8 +330,9 @@ public class UnicodeMap { */ public UnicodeSet getSet(Object value, UnicodeSet result) { if (result == null) result = new UnicodeSet(); - UnicodeSet set = (UnicodeSet) objectToSet.get(value); - if (set != null) result.addAll(set); + for (int i = 0; i < length; ++i) { + if (values[i] == value) result.add(transitions[i], transitions[i+1]); + } return result; } /** @@ -126,7 +343,12 @@ public class UnicodeMap { */ public Collection getAvailableValues(Collection result) { if (result == null) result = new HashSet(); - result.addAll(objectToSet.keySet()); + for (int i = 0; i < length; ++i) { + Object value = values[i]; + if (value == null) continue; + if (result.contains(value)) continue; + result.add(value); + } return result; } /** @@ -136,25 +358,24 @@ public class UnicodeMap { * @return */ public Object getValue(int codepoint) { - if (missing.contains(codepoint)) return null; - Iterator it = objectToSet.keySet().iterator(); - while (it.hasNext()) { - Object value = it.next(); - UnicodeSet set = (UnicodeSet) objectToSet.get(value); - if (set.contains(codepoint)) return value; + if (codepoint < 0 || codepoint > 0x10FFFF) { + throw new IllegalArgumentException("Codepoint out of range: " + codepoint); } - return null; + return values[findIndex(codepoint)]; } public String toString() { StringBuffer result = new StringBuffer(); - Iterator it = objectToSet.keySet().iterator(); - while (it.hasNext()) { - Object value = it.next(); - UnicodeSet set = (UnicodeSet) objectToSet.get(value); - result.append(value) - .append("=>") - .append(set.toPattern(true)) + for (int i = 0; i < length-1; ++i) { + Object value = values[i]; + if (value == null) continue; + int start = transitions[i]; + int end = transitions[i+1]-1; + result.append(Utility.hex(start)); + if (start != end) result.append("..") + .append(Utility.hex(end)); + result.append("\t=>") + .append(values[i] == null ? "null" : values[i].toString()) .append("\r\n"); } return result.toString();