mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
performance fixes for UnicodeMap
X-SVN-Rev: 14492
This commit is contained in:
parent
0af1b1826b
commit
ed21be41c5
1 changed files with 283 additions and 62 deletions
|
@ -5,17 +5,174 @@ import java.util.HashMap;
|
|||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.UnicodeSetIterator;
|
||||
/**
|
||||
* Class for mapping Unicode characters to values
|
||||
* Much smaller storage than using HashMap.
|
||||
* @author Davis
|
||||
*/
|
||||
// TODO Optimize using range map
|
||||
public class UnicodeMap {
|
||||
// TODO optimize
|
||||
private HashMap objectToSet = new HashMap();
|
||||
private UnicodeSet missing = new UnicodeSet(0,0x10FFFF);
|
||||
public final class UnicodeMap {
|
||||
static final boolean ASSERTIONS = false;
|
||||
static final long GROWTH_PERCENT = 200; // 100 is no growth!
|
||||
static final long GROWTH_GAP = 10; // extra bump!
|
||||
|
||||
private int length = 2;
|
||||
private int[] transitions = {0,0x110000,0,0,0,0,0,0,0,0};
|
||||
private Object[] values = new Object[10];
|
||||
{
|
||||
values[1] = "TERMINAL";
|
||||
}
|
||||
|
||||
void _checkInvariants() {
|
||||
if (length < 2
|
||||
|| length > transitions.length
|
||||
|| transitions.length != values.length) {
|
||||
throw new IllegalArgumentException("Invariant failed: Lengths bad");
|
||||
}
|
||||
for (int i = 1; i < length-1; ++i) {
|
||||
if (equator.isEqual(values[i-1], values[i])) {
|
||||
throw new IllegalArgumentException("Invariant failed: values shared at "
|
||||
+ "\t" + Utility.hex(i-1) + ": <" + values[i-1] + ">"
|
||||
+ "\t" + Utility.hex(i) + ": <" + values[i] + ">"
|
||||
);
|
||||
}
|
||||
}
|
||||
if (transitions[0] != 0 || transitions[length-1] != 0x110000) {
|
||||
throw new IllegalArgumentException("Invariant failed: bounds set wrong");
|
||||
}
|
||||
for (int i = 1; i < length-1; ++i) {
|
||||
if (transitions[i-1] >= transitions[i]) {
|
||||
throw new IllegalArgumentException("Invariant failed: not monotonic"
|
||||
+ "\t" + Utility.hex(i-1) + ": " + transitions[i-1]
|
||||
+ "\t" + Utility.hex(i) + ": " + transitions[i]
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public interface Equator {
|
||||
/**
|
||||
* Comparator function. If overridden, must handle case of null,
|
||||
* and compare any two objects in the array
|
||||
* @param a
|
||||
* @param b
|
||||
* @return
|
||||
*/
|
||||
public boolean isEqual(Object a, Object b);
|
||||
}
|
||||
|
||||
public static class SimpleEquator implements Equator {
|
||||
public boolean isEqual(Object a, Object b) {
|
||||
if (a == b) return true;
|
||||
if (a == null || b == null) return false;
|
||||
return a.equals(b);
|
||||
}
|
||||
}
|
||||
private Equator equator = new SimpleEquator();
|
||||
|
||||
/**
|
||||
* Finds an index such that inversionList[i] <= codepoint < inversionList[i+1]
|
||||
* Assumes that 0 <= codepoint <= 0x10FFFF
|
||||
* @param codepoint
|
||||
* @return
|
||||
*/
|
||||
private int findIndex(int c) {
|
||||
int lo = 0;
|
||||
int hi = length - 1;
|
||||
int i = (lo + hi) >>> 1;
|
||||
// invariant: c >= list[lo]
|
||||
// invariant: c < list[hi]
|
||||
while (i != lo) {
|
||||
if (c < transitions[i]) {
|
||||
hi = i;
|
||||
} else {
|
||||
lo = i;
|
||||
}
|
||||
i = (lo + hi) >>> 1;
|
||||
}
|
||||
if (ASSERTIONS) _checkFind(c, lo);
|
||||
return lo;
|
||||
}
|
||||
|
||||
private void _checkFind(int codepoint, int value) {
|
||||
int other = _findIndex(codepoint);
|
||||
if (other != value) {
|
||||
throw new IllegalArgumentException("Invariant failed: binary search"
|
||||
+ "\t" + Utility.hex(codepoint) + ": " + value
|
||||
+ "\tshould be: " + other);
|
||||
}
|
||||
}
|
||||
|
||||
private int _findIndex(int codepoint) {
|
||||
// TODO use binary search
|
||||
for (int i = length-1; i > 0; --i) {
|
||||
if (transitions[i] <= codepoint) return i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try indexed lookup
|
||||
|
||||
static final int SHIFT = 8;
|
||||
int[] starts = new int[0x10FFFF>>SHIFT]; // lowest transition index where codepoint>>x can be found
|
||||
boolean startsValid = false;
|
||||
private int findIndex(int codepoint) {
|
||||
if (!startsValid) {
|
||||
int start = 0;
|
||||
for (int i = 1; i < length; ++i) {
|
||||
|
||||
}
|
||||
}
|
||||
for (int i = length-1; i > 0; --i) {
|
||||
if (transitions[i] <= codepoint) return i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Remove the items from index through index+count-1.
|
||||
* Logically reduces the size of the internal arrays.
|
||||
* @param index
|
||||
* @param count
|
||||
*/
|
||||
private void removeAt(int index, int count) {
|
||||
for (int i = index + count; i < length; ++i) {
|
||||
transitions[i-count] = transitions[i];
|
||||
values[i-count] = values[i];
|
||||
}
|
||||
length -= count;
|
||||
}
|
||||
/**
|
||||
* Add a gap from index to index+count-1.
|
||||
* The values there are undefined, and must be set.
|
||||
* Logically grows arrays to accomodate. Actual growth is limited
|
||||
* @param index
|
||||
* @param count
|
||||
*/
|
||||
private void insertGapAt(int index, int count) {
|
||||
int newLength = length + count;
|
||||
int[] oldtransitions = transitions;
|
||||
Object[] oldvalues = values;
|
||||
if (newLength > transitions.length) {
|
||||
int allocation = (int) (GROWTH_GAP + (newLength * GROWTH_PERCENT) / 100);
|
||||
transitions = new int[allocation];
|
||||
values = new Object[allocation];
|
||||
for (int i = 0; i < index; ++i) {
|
||||
transitions[i] = oldtransitions[i];
|
||||
values[i] = oldvalues[i];
|
||||
}
|
||||
}
|
||||
for (int i = length - 1; i >= index; --i) {
|
||||
transitions[i+count] = oldtransitions[i];
|
||||
values[i+count] = oldvalues[i];
|
||||
}
|
||||
length = newLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Associates code point with value. Removes any previous association.
|
||||
|
@ -23,49 +180,102 @@ public class UnicodeMap {
|
|||
* @param value
|
||||
* @return this, for chaining
|
||||
*/
|
||||
public UnicodeMap put(int codepoint, Object value) {
|
||||
if (!missing.contains(codepoint)) {
|
||||
// remove from wherever it is.
|
||||
Iterator it = objectToSet.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(it.next());
|
||||
if (set.contains(codepoint)) {
|
||||
set.remove(codepoint);
|
||||
break;
|
||||
private UnicodeMap _put(int codepoint, Object value) {
|
||||
int baseIndex = findIndex(codepoint);
|
||||
int limitIndex = baseIndex + 1;
|
||||
// cases are (a) value is already set
|
||||
if (equator.isEqual(values[baseIndex], value)) return this;
|
||||
int baseCP = transitions[baseIndex];
|
||||
int limitCP = transitions[limitIndex];
|
||||
// CASE: At very start of range
|
||||
if (baseCP == codepoint) {
|
||||
boolean connectsWithPrevious =
|
||||
baseIndex != 0 && equator.isEqual(value, values[baseIndex-1]);
|
||||
|
||||
// CASE: Single codepoint range
|
||||
if (limitCP == codepoint + 1) {
|
||||
boolean connectsWithFollowing =
|
||||
baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]);
|
||||
// A1a connects with previous & following, so remove index
|
||||
if (connectsWithPrevious) {
|
||||
if (connectsWithFollowing) {
|
||||
removeAt(baseIndex, 2);
|
||||
return this;
|
||||
}
|
||||
removeAt(baseIndex, 1); // extend previous
|
||||
return this;
|
||||
} else if (connectsWithFollowing) {
|
||||
removeAt(baseIndex, 1); // extend following backwards
|
||||
transitions[baseIndex] = codepoint;
|
||||
return this;
|
||||
}
|
||||
// doesn't connect on either side, just reset
|
||||
values[baseIndex] = value;
|
||||
return this;
|
||||
}
|
||||
// A.1: start of multi codepoint range
|
||||
// if connects
|
||||
if (connectsWithPrevious) {
|
||||
++transitions[baseIndex]; // extend previous
|
||||
} else {
|
||||
// otherwise insert new transition
|
||||
transitions[baseIndex] = codepoint+1; // fix following range
|
||||
insertGapAt(baseIndex, 1);
|
||||
values[baseIndex] = value;
|
||||
transitions[baseIndex] = codepoint;
|
||||
}
|
||||
missing.remove(codepoint);
|
||||
return this;
|
||||
}
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
|
||||
if (set == null) {
|
||||
set = new UnicodeSet();
|
||||
objectToSet.put(value,set);
|
||||
// CASE: at end of range
|
||||
if (limitCP == codepoint + 1) {
|
||||
// if connects, just back up range
|
||||
boolean connectsWithFollowing =
|
||||
baseIndex < length - 1 && equator.isEqual(value, values[limitIndex]);
|
||||
|
||||
if (connectsWithFollowing) {
|
||||
--transitions[limitIndex];
|
||||
return this;
|
||||
} else {
|
||||
insertGapAt(limitIndex, 1);
|
||||
transitions[limitIndex] = codepoint;
|
||||
values[limitIndex] = value;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
set.add(codepoint);
|
||||
// CASE: in middle of range
|
||||
insertGapAt(++baseIndex,2);
|
||||
transitions[baseIndex] = codepoint;
|
||||
values[baseIndex] = value;
|
||||
transitions[++baseIndex] = codepoint + 1;
|
||||
values[baseIndex] = values[baseIndex-2]; // copy lower range values
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* Adds bunch o' codepoints; otherwise like add.
|
||||
* Sets the codepoint value.
|
||||
* @param codepoint
|
||||
* @param value
|
||||
* @return
|
||||
*/
|
||||
public UnicodeMap put(int codepoint, Object value) {
|
||||
if (codepoint < 0 || codepoint > 0x10FFFF) {
|
||||
throw new IllegalArgumentException("Codepoint out of range: " + codepoint);
|
||||
}
|
||||
_put(codepoint, value);
|
||||
if (ASSERTIONS) _checkInvariants();
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* Adds bunch o' codepoints; otherwise like put.
|
||||
* @param codepoints
|
||||
* @param value
|
||||
* @return this, for chaining
|
||||
*/
|
||||
public UnicodeMap putAll(UnicodeSet codepoints, Object value) {
|
||||
if (!missing.containsAll(codepoints)) {
|
||||
// remove from wherever it is.
|
||||
Iterator it = objectToSet.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(it.next());
|
||||
set.removeAll(codepoints);
|
||||
}
|
||||
// TODO optimize
|
||||
UnicodeSetIterator it = new UnicodeSetIterator(codepoints);
|
||||
while (it.next()) {
|
||||
_put(it.codepoint, value);
|
||||
}
|
||||
missing.removeAll(codepoints);
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
|
||||
if (set == null) {
|
||||
set = new UnicodeSet();
|
||||
objectToSet.put(value,set);
|
||||
}
|
||||
set.addAll(codepoints);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -76,8 +286,15 @@ public class UnicodeMap {
|
|||
* @return this, for chaining
|
||||
*/
|
||||
public UnicodeMap putAll(int startCodePoint, int endCodePoint, Object value) {
|
||||
if (startCodePoint < 0 || endCodePoint > 0x10FFFF) {
|
||||
throw new IllegalArgumentException("Codepoint out of range: "
|
||||
+ Utility.hex(startCodePoint) + ".." + Utility.hex(endCodePoint));
|
||||
}
|
||||
// TODO optimize
|
||||
return putAll(new UnicodeSet(startCodePoint, endCodePoint), value);
|
||||
for (int i = startCodePoint; i <= endCodePoint; ++i) {
|
||||
_put(i, value);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* Add all the (main) values from a Unicode property
|
||||
|
@ -85,23 +302,22 @@ public class UnicodeMap {
|
|||
* @return
|
||||
*/
|
||||
public UnicodeMap putAll(UnicodeProperty prop) {
|
||||
UnicodeSet temp = new UnicodeSet();
|
||||
Iterator it = prop.getAliases().iterator();
|
||||
while(it.hasNext()) {
|
||||
String value = (String) it.next();
|
||||
temp.clear();
|
||||
putAll(prop.getSet(value,temp), value);
|
||||
// TODO optimize
|
||||
for (int i = 0; i <= 0x10FFFF; ++i) {
|
||||
_put(i, prop.getValue(i));
|
||||
}
|
||||
return null;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the currently unmapped Unicode code points to the given value.
|
||||
* @param value
|
||||
* @return
|
||||
*/public UnicodeMap setMissing(Object value) {
|
||||
objectToSet.put(value,missing);
|
||||
missing = new UnicodeSet();
|
||||
*/
|
||||
public UnicodeMap setMissing(Object value) {
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (values[i] == null) values[i] = value;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
|
@ -114,8 +330,9 @@ public class UnicodeMap {
|
|||
*/
|
||||
public UnicodeSet getSet(Object value, UnicodeSet result) {
|
||||
if (result == null) result = new UnicodeSet();
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
|
||||
if (set != null) result.addAll(set);
|
||||
for (int i = 0; i < length; ++i) {
|
||||
if (values[i] == value) result.add(transitions[i], transitions[i+1]);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
/**
|
||||
|
@ -126,7 +343,12 @@ public class UnicodeMap {
|
|||
*/
|
||||
public Collection getAvailableValues(Collection result) {
|
||||
if (result == null) result = new HashSet();
|
||||
result.addAll(objectToSet.keySet());
|
||||
for (int i = 0; i < length; ++i) {
|
||||
Object value = values[i];
|
||||
if (value == null) continue;
|
||||
if (result.contains(value)) continue;
|
||||
result.add(value);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
/**
|
||||
|
@ -136,25 +358,24 @@ public class UnicodeMap {
|
|||
* @return
|
||||
*/
|
||||
public Object getValue(int codepoint) {
|
||||
if (missing.contains(codepoint)) return null;
|
||||
Iterator it = objectToSet.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object value = it.next();
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
|
||||
if (set.contains(codepoint)) return value;
|
||||
if (codepoint < 0 || codepoint > 0x10FFFF) {
|
||||
throw new IllegalArgumentException("Codepoint out of range: " + codepoint);
|
||||
}
|
||||
return null;
|
||||
return values[findIndex(codepoint)];
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuffer result = new StringBuffer();
|
||||
Iterator it = objectToSet.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Object value = it.next();
|
||||
UnicodeSet set = (UnicodeSet) objectToSet.get(value);
|
||||
result.append(value)
|
||||
.append("=>")
|
||||
.append(set.toPattern(true))
|
||||
for (int i = 0; i < length-1; ++i) {
|
||||
Object value = values[i];
|
||||
if (value == null) continue;
|
||||
int start = transitions[i];
|
||||
int end = transitions[i+1]-1;
|
||||
result.append(Utility.hex(start));
|
||||
if (start != end) result.append("..")
|
||||
.append(Utility.hex(end));
|
||||
result.append("\t=>")
|
||||
.append(values[i] == null ? "null" : values[i].toString())
|
||||
.append("\r\n");
|
||||
}
|
||||
return result.toString();
|
||||
|
|
Loading…
Add table
Reference in a new issue