performance fixes for UnicodeMap

X-SVN-Rev: 14492
This commit is contained in:
Mark Davis 2004-02-12 06:56:28 +00:00
parent 0af1b1826b
commit ed21be41c5

View file

@ -5,17 +5,174 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
/**
* Class for mapping Unicode characters to values
* Much smaller storage than using HashMap.
* @author Davis
*/
// TODO Optimize using range map
public class UnicodeMap {
// TODO optimize
private HashMap objectToSet = new HashMap();
private UnicodeSet missing = new UnicodeSet(0,0x10FFFF);
public final class UnicodeMap {
static final boolean ASSERTIONS = false;
static final long GROWTH_PERCENT = 200; // 100 is no growth!
static final long GROWTH_GAP = 10; // extra bump!
private int length = 2;
private int[] transitions = {0,0x110000,0,0,0,0,0,0,0,0};
private Object[] values = new Object[10];
{
values[1] = "TERMINAL";
}
void _checkInvariants() {
if (length < 2
|| length > transitions.length
|| transitions.length != values.length) {
throw new IllegalArgumentException("Invariant failed: Lengths bad");
}
for (int i = 1; i < length-1; ++i) {
if (equator.isEqual(values[i-1], values[i])) {
throw new IllegalArgumentException("Invariant failed: values shared at "
+ "\t" + Utility.hex(i-1) + ": <" + values[i-1] + ">"
+ "\t" + Utility.hex(i) + ": <" + values[i] + ">"
);
}
}
if (transitions[0] != 0 || transitions[length-1] != 0x110000) {
throw new IllegalArgumentException("Invariant failed: bounds set wrong");
}
for (int i = 1; i < length-1; ++i) {
if (transitions[i-1] >= transitions[i]) {
throw new IllegalArgumentException("Invariant failed: not monotonic"
+ "\t" + Utility.hex(i-1) + ": " + transitions[i-1]
+ "\t" + Utility.hex(i) + ": " + transitions[i]
);
}
}
}
public interface Equator {
/**
* Comparator function. If overridden, must handle case of null,
* and compare any two objects in the array
* @param a
* @param b
* @return
*/
public boolean isEqual(Object a, Object b);
}
public static class SimpleEquator implements Equator {
public boolean isEqual(Object a, Object b) {
if (a == b) return true;
if (a == null || b == null) return false;
return a.equals(b);
}
}
private Equator equator = new SimpleEquator();
/**
* Finds an index such that inversionList[i] <= codepoint < inversionList[i+1]
* Assumes that 0 <= codepoint <= 0x10FFFF
* @param codepoint
* @return
*/
private int findIndex(int c) {
int lo = 0;
int hi = length - 1;
int i = (lo + hi) >>> 1;
// invariant: c >= list[lo]
// invariant: c < list[hi]
while (i != lo) {
if (c < transitions[i]) {
hi = i;
} else {
lo = i;
}
i = (lo + hi) >>> 1;
}
if (ASSERTIONS) _checkFind(c, lo);
return lo;
}
private void _checkFind(int codepoint, int value) {
int other = _findIndex(codepoint);
if (other != value) {
throw new IllegalArgumentException("Invariant failed: binary search"
+ "\t" + Utility.hex(codepoint) + ": " + value
+ "\tshould be: " + other);
}
}
private int _findIndex(int codepoint) {
// TODO use binary search
for (int i = length-1; i > 0; --i) {
if (transitions[i] <= codepoint) return i;
}
return 0;
}
/*
* Try indexed lookup
static final int SHIFT = 8;
int[] starts = new int[0x10FFFF>>SHIFT]; // lowest transition index where codepoint>>x can be found
boolean startsValid = false;
private int findIndex(int codepoint) {
if (!startsValid) {
int start = 0;
for (int i = 1; i < length; ++i) {
}
}
for (int i = length-1; i > 0; --i) {
if (transitions[i] <= codepoint) return i;
}
return 0;
}
*/
/**
* Remove the items from index through index+count-1.
* Logically reduces the size of the internal arrays.
* @param index
* @param count
*/
private void removeAt(int index, int count) {
for (int i = index + count; i < length; ++i) {
transitions[i-count] = transitions[i];
values[i-count] = values[i];
}
length -= count;
}
/**
* Add a gap from index to index+count-1.
* The values there are undefined, and must be set.
* Logically grows arrays to accomodate. Actual growth is limited
* @param index
* @param count
*/
private void insertGapAt(int index, int count) {
int newLength = length + count;
int[] oldtransitions = transitions;
Object[] oldvalues = values;
if (newLength > transitions.length) {
int allocation = (int) (GROWTH_GAP + (newLength * GROWTH_PERCENT) / 100);
transitions = new int[allocation];
values = new Object[allocation];
for (int i = 0; i < index; ++i) {
transitions[i] = oldtransitions[i];
values[i] = oldvalues[i];
}
}
for (int i = length - 1; i >= index; --i) {
transitions[i+count] = oldtransitions[i];
values[i+count] = oldvalues[i];
}
length = newLength;
}
/**
* Associates code point with value. Removes any previous association.
@ -23,49 +180,102 @@ public class UnicodeMap {
* @param value
* @return this, for chaining
*/
public UnicodeMap put(int codepoint, Object value) {
if (!missing.contains(codepoint)) {
// remove from wherever it is.
Iterator it = objectToSet.keySet().iterator();
while (it.hasNext()) {
UnicodeSet set = (UnicodeSet) objectToSet.get(it.next());
if (set.contains(codepoint)) {
set.remove(codepoint);
break;
private UnicodeMap _put(int codepoint, Object value) {
int baseIndex = findIndex(codepoint);
int limitIndex = baseIndex + 1;
// cases are (a) value is already set
if (equator.isEqual(values[baseIndex], value)) return this;
int baseCP = transitions[baseIndex];
int limitCP = transitions[limitIndex];
// CASE: At very start of range
if (baseCP == codepoint) {
boolean connectsWithPrevious =
baseIndex != 0 && equator.isEqual(value, values[baseIndex-1]);
// CASE: Single codepoint range
if (limitCP == codepoint + 1) {
boolean connectsWithFollowing =
baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]);
// A1a connects with previous & following, so remove index
if (connectsWithPrevious) {
if (connectsWithFollowing) {
removeAt(baseIndex, 2);
return this;
}
removeAt(baseIndex, 1); // extend previous
return this;
} else if (connectsWithFollowing) {
removeAt(baseIndex, 1); // extend following backwards
transitions[baseIndex] = codepoint;
return this;
}
// doesn't connect on either side, just reset
values[baseIndex] = value;
return this;
}
// A.1: start of multi codepoint range
// if connects
if (connectsWithPrevious) {
++transitions[baseIndex]; // extend previous
} else {
// otherwise insert new transition
transitions[baseIndex] = codepoint+1; // fix following range
insertGapAt(baseIndex, 1);
values[baseIndex] = value;
transitions[baseIndex] = codepoint;
}
missing.remove(codepoint);
return this;
}
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
if (set == null) {
set = new UnicodeSet();
objectToSet.put(value,set);
// CASE: at end of range
if (limitCP == codepoint + 1) {
// if connects, just back up range
boolean connectsWithFollowing =
baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]);
if (connectsWithFollowing) {
--transitions[limitIndex];
return this;
} else {
insertGapAt(limitIndex, 1);
transitions[limitIndex] = codepoint;
values[limitIndex] = value;
}
return this;
}
set.add(codepoint);
// CASE: in middle of range
insertGapAt(++baseIndex,2);
transitions[baseIndex] = codepoint;
values[baseIndex] = value;
transitions[++baseIndex] = codepoint + 1;
values[baseIndex] = values[baseIndex-2]; // copy lower range values
return this;
}
/**
* Adds bunch o' codepoints; otherwise like add.
* Sets the codepoint value.
* @param codepoint
* @param value
* @return
*/
public UnicodeMap put(int codepoint, Object value) {
if (codepoint < 0 || codepoint > 0x10FFFF) {
throw new IllegalArgumentException("Codepoint out of range: " + codepoint);
}
_put(codepoint, value);
if (ASSERTIONS) _checkInvariants();
return this;
}
/**
* Adds bunch o' codepoints; otherwise like put.
* @param codepoints
* @param value
* @return this, for chaining
*/
public UnicodeMap putAll(UnicodeSet codepoints, Object value) {
if (!missing.containsAll(codepoints)) {
// remove from wherever it is.
Iterator it = objectToSet.keySet().iterator();
while (it.hasNext()) {
UnicodeSet set = (UnicodeSet) objectToSet.get(it.next());
set.removeAll(codepoints);
}
// TODO optimize
UnicodeSetIterator it = new UnicodeSetIterator(codepoints);
while (it.next()) {
_put(it.codepoint, value);
}
missing.removeAll(codepoints);
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
if (set == null) {
set = new UnicodeSet();
objectToSet.put(value,set);
}
set.addAll(codepoints);
return this;
}
@ -76,8 +286,15 @@ public class UnicodeMap {
* @return this, for chaining
*/
public UnicodeMap putAll(int startCodePoint, int endCodePoint, Object value) {
if (startCodePoint < 0 || endCodePoint > 0x10FFFF) {
throw new IllegalArgumentException("Codepoint out of range: "
+ Utility.hex(startCodePoint) + ".." + Utility.hex(endCodePoint));
}
// TODO optimize
return putAll(new UnicodeSet(startCodePoint, endCodePoint), value);
for (int i = startCodePoint; i <= endCodePoint; ++i) {
_put(i, value);
}
return this;
}
/**
* Add all the (main) values from a Unicode property
@ -85,23 +302,22 @@ public class UnicodeMap {
* @return
*/
public UnicodeMap putAll(UnicodeProperty prop) {
UnicodeSet temp = new UnicodeSet();
Iterator it = prop.getAliases().iterator();
while(it.hasNext()) {
String value = (String) it.next();
temp.clear();
putAll(prop.getSet(value,temp), value);
// TODO optimize
for (int i = 0; i <= 0x10FFFF; ++i) {
_put(i, prop.getValue(i));
}
return null;
return this;
}
/**
* Set the currently unmapped Unicode code points to the given value.
* @param value
* @return
*/public UnicodeMap setMissing(Object value) {
objectToSet.put(value,missing);
missing = new UnicodeSet();
*/
public UnicodeMap setMissing(Object value) {
for (int i = 0; i < length; ++i) {
if (values[i] == null) values[i] = value;
}
return this;
}
/**
@ -114,8 +330,9 @@ public class UnicodeMap {
*/
public UnicodeSet getSet(Object value, UnicodeSet result) {
if (result == null) result = new UnicodeSet();
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
if (set != null) result.addAll(set);
for (int i = 0; i < length; ++i) {
if (values[i] == value) result.add(transitions[i], transitions[i+1]);
}
return result;
}
/**
@ -126,7 +343,12 @@ public class UnicodeMap {
*/
public Collection getAvailableValues(Collection result) {
if (result == null) result = new HashSet();
result.addAll(objectToSet.keySet());
for (int i = 0; i < length; ++i) {
Object value = values[i];
if (value == null) continue;
if (result.contains(value)) continue;
result.add(value);
}
return result;
}
/**
@ -136,25 +358,24 @@ public class UnicodeMap {
* @return
*/
public Object getValue(int codepoint) {
if (missing.contains(codepoint)) return null;
Iterator it = objectToSet.keySet().iterator();
while (it.hasNext()) {
Object value = it.next();
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
if (set.contains(codepoint)) return value;
if (codepoint < 0 || codepoint > 0x10FFFF) {
throw new IllegalArgumentException("Codepoint out of range: " + codepoint);
}
return null;
return values[findIndex(codepoint)];
}
public String toString() {
StringBuffer result = new StringBuffer();
Iterator it = objectToSet.keySet().iterator();
while (it.hasNext()) {
Object value = it.next();
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
result.append(value)
.append("=>")
.append(set.toPattern(true))
for (int i = 0; i < length-1; ++i) {
Object value = values[i];
if (value == null) continue;
int start = transitions[i];
int end = transitions[i+1]-1;
result.append(Utility.hex(start));
if (start != end) result.append("..")
.append(Utility.hex(end));
result.append("\t=>")
.append(values[i] == null ? "null" : values[i].toString())
.append("\r\n");
}
return result.toString();