mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-2038 improve UnicodeSet performance
X-SVN-Rev: 9894
This commit is contained in:
parent
fe64e9591c
commit
63c9eace9a
1 changed files with 196 additions and 6 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
|
||||
* $Date: 2002/07/29 19:36:07 $
|
||||
* $Revision: 1.71 $
|
||||
* $Date: 2002/09/19 22:37:09 $
|
||||
* $Revision: 1.72 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -210,7 +210,7 @@ import java.util.Iterator;
|
|||
* </table>
|
||||
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.71 $ $Date: 2002/07/29 19:36:07 $
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.72 $ $Date: 2002/09/19 22:37:09 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
|
@ -873,19 +873,119 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
if (end < MIN_VALUE || end > MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(end, 6));
|
||||
}
|
||||
if (start <= end) {
|
||||
if (start < end) {
|
||||
add(range(start, end), 2, 0);
|
||||
} else if (start == end) {
|
||||
add(start);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
//| /**
|
||||
//| * Format out the inversion list as a string, for debugging. Uncomment when
|
||||
//| * needed.
|
||||
//| */
|
||||
//| private String dump() {
|
||||
//| StringBuffer buf = new StringBuffer("[");
|
||||
//| for (int i=0; i<len; ++i) {
|
||||
//| if (i != 0) buf.append(", ");
|
||||
//| int c = list[i];
|
||||
//| if (c <= 0xFF) {
|
||||
//| buf.append((char) c);
|
||||
//| } else {
|
||||
//| buf.append("U+").append(Utility.hex(c, (c<0x10000)?4:6));
|
||||
//| }
|
||||
//| }
|
||||
//| buf.append("]");
|
||||
//| return buf.toString();
|
||||
//| }
|
||||
|
||||
/**
|
||||
* Adds the specified character to this set if it is not already
|
||||
* present. If this set already contains the specified character,
|
||||
* the call leaves this set unchanged.
|
||||
*/
|
||||
public final UnicodeSet add(int c) {
|
||||
add(c, c);
|
||||
if (c < MIN_VALUE || c > MAX_VALUE) {
|
||||
throw new IllegalArgumentException("Invalid code point U+" + Utility.hex(c, 6));
|
||||
}
|
||||
|
||||
// find smallest i such that c < list[i]
|
||||
// if odd, then it is IN the set
|
||||
// if even, then it is OUT of the set
|
||||
int i = findCodePoint(c);
|
||||
|
||||
// already in set?
|
||||
if ((i & 1) != 0) return this;
|
||||
|
||||
// HIGH is 0x110000
|
||||
// assert(list[len-1] == HIGH);
|
||||
|
||||
// empty = [HIGH]
|
||||
// [start_0, limit_0, start_1, limit_1, HIGH]
|
||||
|
||||
// [..., start_i-1, limit_i-1, start_i, limit_i, ..., HIGH]
|
||||
// ^
|
||||
// list[i]
|
||||
|
||||
// i == 0 means c is before the first range
|
||||
|
||||
if (c == list[i]-1) {
|
||||
// c is before start of next range
|
||||
list[i] = c;
|
||||
// if we touched the HIGH mark, then add a new one
|
||||
if (c == MAX_VALUE) {
|
||||
ensureCapacity(len+1);
|
||||
list[len++] = HIGH;
|
||||
}
|
||||
if (i > 0 && c == list[i-1]) {
|
||||
// collapse adjacent ranges
|
||||
|
||||
// [..., start_i-1, c, c, limit_i, ..., HIGH]
|
||||
// ^
|
||||
// list[i]
|
||||
System.arraycopy(list, i+1, list, i-1, len-i-1);
|
||||
len -= 2;
|
||||
}
|
||||
}
|
||||
|
||||
else if (i > 0 && c == list[i-1]) {
|
||||
// c is after end of prior range
|
||||
list[i-1]++;
|
||||
// no need to chcek for collapse here
|
||||
}
|
||||
|
||||
else {
|
||||
// At this point we know the new char is not adjacent to
|
||||
// any existing ranges, and it is not 10FFFF.
|
||||
|
||||
|
||||
// [..., start_i-1, limit_i-1, start_i, limit_i, ..., HIGH]
|
||||
// ^
|
||||
// list[i]
|
||||
|
||||
// [..., start_i-1, limit_i-1, c, c+1, start_i, limit_i, ..., HIGH]
|
||||
// ^
|
||||
// list[i]
|
||||
|
||||
// Don't use ensureCapacity() to save on copying.
|
||||
// NOTE: This has no measurable impact on performance,
|
||||
// but it might help in some usage patterns.
|
||||
if (len+2 > list.length) {
|
||||
int[] temp = new int[len + 2 + GROW_EXTRA];
|
||||
if (i != 0) System.arraycopy(list, 0, temp, 0, i);
|
||||
System.arraycopy(list, i, temp, i+2, len-i);
|
||||
list = temp;
|
||||
} else {
|
||||
System.arraycopy(list, i, list, i+2, len-i);
|
||||
}
|
||||
|
||||
list[i] = c;
|
||||
list[i+1] = c+1;
|
||||
len += 2;
|
||||
}
|
||||
|
||||
pat = null;
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -1208,12 +1308,15 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// Return the smallest i such that c < list[i]. Assume
|
||||
// list[len - 1] == HIGH and that c is legal (0..HIGH-1).
|
||||
if (c < list[0]) return 0;
|
||||
// High runner test. c is often after the last range, so an
|
||||
// initial check for this condition pays off.
|
||||
if (len >= 2 && c >= list[len-2]) return len-1;
|
||||
int lo = 0;
|
||||
int hi = len - 1;
|
||||
// invariant: c >= list[lo]
|
||||
// invariant: c < list[hi]
|
||||
for (;;) {
|
||||
int i = (lo + hi) / 2;
|
||||
int i = (lo + hi) >>> 1;
|
||||
if (i == lo) return hi;
|
||||
if (c < list[i]) {
|
||||
hi = i;
|
||||
|
@ -1222,6 +1325,93 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Beginnings of an unrolled binary search implementation. Problems
|
||||
// to be solved:
|
||||
|
||||
// 1. Initial search in the POW2 array is slow. To make this a
|
||||
// win, do the POW2 search ONLY when len changes. Find all
|
||||
// locations where len changes and update POW2 there.
|
||||
|
||||
// 2. Array must be of size at least 2^n, where this is the
|
||||
// smallest 2^n >= actual length. This allows array indexing in
|
||||
// the case statement. Entries from len..2^n-1 must be HIGH so as
|
||||
// to not trigger the if statements. Modify all operations that
|
||||
// manipulate the list so that they ensure these conditions.
|
||||
// Alternatively, use exception handling and catch an array index
|
||||
// out of bounds, and then decrement 'power' and restart the case
|
||||
// statement.
|
||||
|
||||
// These two problems can be solved, but they are non-local
|
||||
// changes throughout the file, so it will take some work to test
|
||||
// them. I would _guess_ that the overall class performance will
|
||||
// be slower -- although it's possible that calls to contains() on
|
||||
// an unchanging set object will end up being faster. The only
|
||||
// way to tell is to complete the implementation and measure it.
|
||||
|
||||
// // The maximum possible length is HIGH/2, e.g., [ace...], = 557056
|
||||
// static final int POW2[] = {
|
||||
// 0x000001, // 2^0 = 1
|
||||
// 0x000002, // 2^1 = 2
|
||||
// 0x000004, // 2^2 = 4
|
||||
// 0x000008, // 2^3 = 8
|
||||
// 0x000010, // 2^4 = 16
|
||||
// 0x000020, // 2^5 = 32
|
||||
// 0x000040, // 2^6 = 64
|
||||
// 0x000080, // 2^7 = 128
|
||||
// 0x000100, // 2^8 = 256
|
||||
// 0x000200, // 2^9 = 512
|
||||
// 0x000400, // 2^10 = 1024
|
||||
// 0x000800, // 2^11 = 2048
|
||||
// 0x001000, // 2^12 = 4096
|
||||
// 0x002000, // 2^13 = 8192
|
||||
// 0x004000, // 2^14 = 16384
|
||||
// 0x008000, // 2^15 = 32768
|
||||
// 0x010000, // 2^16 = 65536
|
||||
// 0x020000, // 2^17 = 131072
|
||||
// 0x040000, // 2^18 = 262144
|
||||
// 0x080000, // 2^19 = 524288
|
||||
// 0x100000, // 2^20 = 1048576
|
||||
// };
|
||||
//
|
||||
// private final int findCodePoint(int c) {
|
||||
// // Return the smallest i such that c < list[i]. Assume
|
||||
// // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
|
||||
// if (c < list[0]) return 0;
|
||||
// // High runner test. c is often after the last range, so an
|
||||
// // initial check for this condition pays off.
|
||||
// if (len >= 2 && c >= list[len-2]) return len-1;
|
||||
//
|
||||
// // Find the least power of 2 greater than len
|
||||
// // TODO: Check this logic...is this what we want?
|
||||
// int power;
|
||||
// for (power=POW2.length-1; power>0 && len<POW2[power]; --power) {}
|
||||
//
|
||||
// switch (power) {
|
||||
// case 19: if (c >= list[index+0x40000]) index += 0x40000;
|
||||
// case 18: if (c >= list[index+0x20000]) index += 0x20000;
|
||||
// case 17: if (c >= list[index+0x10000]) index += 0x10000;
|
||||
// case 16: if (c >= list[index+0x08000]) index += 0x08000;
|
||||
// case 15: if (c >= list[index+0x04000]) index += 0x04000;
|
||||
// case 14: if (c >= list[index+0x02000]) index += 0x02000;
|
||||
// case 13: if (c >= list[index+0x01000]) index += 0x01000;
|
||||
// case 12: if (c >= list[index+0x00800]) index += 0x00800;
|
||||
// case 11: if (c >= list[index+0x00400]) index += 0x00400;
|
||||
// case 10: if (c >= list[index+0x00200]) index += 0x00200;
|
||||
// case 9: if (c >= list[index+0x00100]) index += 0x00100;
|
||||
// case 8: if (c >= list[index+0x00080]) index += 0x00080;
|
||||
// case 7: if (c >= list[index+0x00040]) index += 0x00040;
|
||||
// case 6: if (c >= list[index+0x00020]) index += 0x00020;
|
||||
// case 5: if (c >= list[index+0x00010]) index += 0x00010;
|
||||
// case 4: if (c >= list[index+0x00008]) index += 0x00008;
|
||||
// case 3: if (c >= list[index+0x00004]) index += 0x00004;
|
||||
// case 2: if (c >= list[index+0x00002]) index += 0x00002;
|
||||
// case 1: if (c >= list[index+0x00001]) index++;
|
||||
// case 0: if (c >= list[index]) index++;
|
||||
// }
|
||||
//
|
||||
// // TODO: double check and finish
|
||||
// }
|
||||
|
||||
/**
|
||||
* Returns true if this set contains every character
|
||||
|
|
Loading…
Add table
Reference in a new issue