mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-20250 faster MutableCodePointTrie.build(): use a hashtable to find equal blocks in earlier parts of the compacted data & index arrays
This commit is contained in:
parent
93e84caa65
commit
59006770ed
2 changed files with 519 additions and 151 deletions
|
@ -65,6 +65,7 @@ constexpr uint8_t I3_18 = 3;
|
|||
constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
|
||||
|
||||
class AllSameBlocks;
|
||||
class MixedBlocks;
|
||||
|
||||
class MutableCodePointTrie : public UMemory {
|
||||
public:
|
||||
|
@ -97,8 +98,10 @@ private:
|
|||
void maskValues(uint32_t mask);
|
||||
UChar32 findHighStart() const;
|
||||
int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
|
||||
int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex);
|
||||
int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode);
|
||||
int32_t compactData(
|
||||
int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
|
||||
int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
|
||||
int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode);
|
||||
int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
|
||||
|
||||
uint32_t *index = nullptr;
|
||||
|
@ -553,28 +556,8 @@ void MutableCodePointTrie::maskValues(uint32_t mask) {
|
|||
}
|
||||
}
|
||||
|
||||
inline bool
|
||||
equalBlocks(const uint32_t *s, const uint32_t *t, int32_t length) {
|
||||
while (length > 0 && *s == *t) {
|
||||
++s;
|
||||
++t;
|
||||
--length;
|
||||
}
|
||||
return length == 0;
|
||||
}
|
||||
|
||||
inline bool
|
||||
equalBlocks(const uint16_t *s, const uint32_t *t, int32_t length) {
|
||||
while (length > 0 && *s == *t) {
|
||||
++s;
|
||||
++t;
|
||||
--length;
|
||||
}
|
||||
return length == 0;
|
||||
}
|
||||
|
||||
inline bool
|
||||
equalBlocks(const uint16_t *s, const uint16_t *t, int32_t length) {
|
||||
template<typename UIntA, typename UIntB>
|
||||
bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) {
|
||||
while (length > 0 && *s == *t) {
|
||||
++s;
|
||||
++t;
|
||||
|
@ -590,36 +573,6 @@ bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
|
|||
}
|
||||
|
||||
/** Search for an identical block. */
|
||||
int32_t findSameBlock(const uint32_t *p, int32_t pStart, int32_t length,
|
||||
const uint32_t *q, int32_t qStart, int32_t blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
length -= blockLength;
|
||||
|
||||
q += qStart;
|
||||
while (pStart <= length) {
|
||||
if (equalBlocks(p + pStart, q, blockLength)) {
|
||||
return pStart;
|
||||
}
|
||||
++pStart;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
|
||||
const uint32_t *q, int32_t qStart, int32_t blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
length -= blockLength;
|
||||
|
||||
q += qStart;
|
||||
while (pStart <= length) {
|
||||
if (equalBlocks(p + pStart, q, blockLength)) {
|
||||
return pStart;
|
||||
}
|
||||
++pStart;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
|
||||
const uint16_t *q, int32_t qStart, int32_t blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
|
@ -660,30 +613,9 @@ int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
|
|||
* Look for maximum overlap of the beginning of the other block
|
||||
* with the previous, adjacent block.
|
||||
*/
|
||||
int32_t getOverlap(const uint32_t *p, int32_t length,
|
||||
const uint32_t *q, int32_t qStart, int32_t blockLength) {
|
||||
int32_t overlap = blockLength - 1;
|
||||
U_ASSERT(overlap <= length);
|
||||
q += qStart;
|
||||
while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
|
||||
--overlap;
|
||||
}
|
||||
return overlap;
|
||||
}
|
||||
|
||||
int32_t getOverlap(const uint16_t *p, int32_t length,
|
||||
const uint32_t *q, int32_t qStart, int32_t blockLength) {
|
||||
int32_t overlap = blockLength - 1;
|
||||
U_ASSERT(overlap <= length);
|
||||
q += qStart;
|
||||
while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
|
||||
--overlap;
|
||||
}
|
||||
return overlap;
|
||||
}
|
||||
|
||||
int32_t getOverlap(const uint16_t *p, int32_t length,
|
||||
const uint16_t *q, int32_t qStart, int32_t blockLength) {
|
||||
template<typename UIntA, typename UIntB>
|
||||
int32_t getOverlap(const UIntA *p, int32_t length,
|
||||
const UIntB *q, int32_t qStart, int32_t blockLength) {
|
||||
int32_t overlap = blockLength - 1;
|
||||
U_ASSERT(overlap <= length);
|
||||
q += qStart;
|
||||
|
@ -812,6 +744,171 @@ private:
|
|||
int32_t refCounts[CAPACITY];
|
||||
};
|
||||
|
||||
// Custom hash table for mixed-value blocks to be found anywhere in the
|
||||
// compacted data or index so far.
|
||||
class MixedBlocks {
|
||||
public:
|
||||
MixedBlocks() {}
|
||||
~MixedBlocks() {
|
||||
uprv_free(table);
|
||||
}
|
||||
|
||||
bool init(int32_t maxLength, int32_t newBlockLength) {
|
||||
// We store actual data indexes + 1 to reserve 0 for empty entries.
|
||||
int32_t maxDataIndex = maxLength - newBlockLength + 1;
|
||||
int32_t newLength;
|
||||
if (maxDataIndex <= 0xfff) { // 4k
|
||||
newLength = 6007;
|
||||
shift = 12;
|
||||
mask = 0xfff;
|
||||
} else if (maxDataIndex <= 0x7fff) { // 32k
|
||||
newLength = 50021;
|
||||
shift = 15;
|
||||
mask = 0x7fff;
|
||||
} else if (maxDataIndex <= 0x1ffff) { // 128k
|
||||
newLength = 200003;
|
||||
shift = 17;
|
||||
mask = 0x1ffff;
|
||||
} else {
|
||||
// maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
|
||||
newLength = 1500007;
|
||||
shift = 21;
|
||||
mask = 0x1fffff;
|
||||
}
|
||||
if (newLength > capacity) {
|
||||
uprv_free(table);
|
||||
table = (uint32_t *)uprv_malloc(newLength * 4);
|
||||
if (table == nullptr) {
|
||||
return false;
|
||||
}
|
||||
capacity = newLength;
|
||||
}
|
||||
length = newLength;
|
||||
uprv_memset(table, 0, length * 4);
|
||||
|
||||
blockLength = newBlockLength;
|
||||
return true;
|
||||
}
|
||||
|
||||
template<typename UInt>
|
||||
void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) {
|
||||
int32_t start = prevDataLength - blockLength;
|
||||
if (start >= minStart) {
|
||||
++start; // Skip the last block that we added last time.
|
||||
} else {
|
||||
start = minStart; // Begin with the first full block.
|
||||
}
|
||||
for (int32_t end = newDataLength - blockLength; start <= end; ++start) {
|
||||
uint32_t hashCode = makeHashCode(data, start);
|
||||
addEntry(data, start, hashCode, start);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename UIntA, typename UIntB>
|
||||
int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const {
|
||||
uint32_t hashCode = makeHashCode(blockData, blockStart);
|
||||
int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode);
|
||||
if (entryIndex >= 0) {
|
||||
return (table[entryIndex] & mask) - 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const {
|
||||
uint32_t hashCode = makeHashCode(blockValue);
|
||||
int32_t entryIndex = findEntry(data, blockValue, hashCode);
|
||||
if (entryIndex >= 0) {
|
||||
return (table[entryIndex] & mask) - 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
template<typename UInt>
|
||||
uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const {
|
||||
int32_t blockLimit = blockStart + blockLength;
|
||||
uint32_t hashCode = blockData[blockStart++];
|
||||
do {
|
||||
hashCode = 37 * hashCode + blockData[blockStart++];
|
||||
} while (blockStart < blockLimit);
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
uint32_t makeHashCode(uint32_t blockValue) const {
|
||||
uint32_t hashCode = blockValue;
|
||||
for (int32_t i = 1; i < blockLength; ++i) {
|
||||
hashCode = 37 * hashCode + blockValue;
|
||||
}
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
template<typename UInt>
|
||||
void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) {
|
||||
U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask);
|
||||
int32_t entryIndex = findEntry(data, data, blockStart, hashCode);
|
||||
if (entryIndex < 0) {
|
||||
table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename UIntA, typename UIntB>
|
||||
int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart,
|
||||
uint32_t hashCode) const {
|
||||
uint32_t shiftedHashCode = hashCode << shift;
|
||||
int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
|
||||
for (int32_t entryIndex = initialEntryIndex;;) {
|
||||
uint32_t entry = table[entryIndex];
|
||||
if (entry == 0) {
|
||||
return ~entryIndex;
|
||||
}
|
||||
if ((entry & ~mask) == shiftedHashCode) {
|
||||
int32_t dataIndex = (entry & mask) - 1;
|
||||
if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) {
|
||||
return entryIndex;
|
||||
}
|
||||
}
|
||||
entryIndex = nextIndex(initialEntryIndex, entryIndex);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const {
|
||||
uint32_t shiftedHashCode = hashCode << shift;
|
||||
int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1
|
||||
for (int32_t entryIndex = initialEntryIndex;;) {
|
||||
uint32_t entry = table[entryIndex];
|
||||
if (entry == 0) {
|
||||
return ~entryIndex;
|
||||
}
|
||||
if ((entry & ~mask) == shiftedHashCode) {
|
||||
int32_t dataIndex = (entry & mask) - 1;
|
||||
if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) {
|
||||
return entryIndex;
|
||||
}
|
||||
}
|
||||
entryIndex = nextIndex(initialEntryIndex, entryIndex);
|
||||
}
|
||||
}
|
||||
|
||||
inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const {
|
||||
// U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
|
||||
return (entryIndex + initialEntryIndex) % length;
|
||||
}
|
||||
|
||||
// Hash table.
|
||||
// The length is a prime number, larger than the maximum data length.
|
||||
// The "shift" lower bits store a data index + 1.
|
||||
// The remaining upper bits store a partial hashCode of the block data values.
|
||||
uint32_t *table = nullptr;
|
||||
int32_t capacity = 0;
|
||||
int32_t length = 0;
|
||||
int32_t shift = 0;
|
||||
uint32_t mask = 0;
|
||||
|
||||
int32_t blockLength = 0;
|
||||
};
|
||||
|
||||
int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
|
||||
#ifdef UCPTRIE_DEBUG
|
||||
bool overflow = false;
|
||||
|
@ -967,8 +1064,9 @@ void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
|
|||
*
|
||||
* It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
|
||||
*/
|
||||
int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
||||
uint32_t *newData, int32_t dataNullIndex) {
|
||||
int32_t MutableCodePointTrie::compactData(
|
||||
int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity,
|
||||
int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) {
|
||||
#ifdef UCPTRIE_DEBUG
|
||||
int32_t countSame=0, sumOverlaps=0;
|
||||
bool printData = dataLength == 29088 /* line.brk */ ||
|
||||
|
@ -988,8 +1086,14 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
|||
#endif
|
||||
}
|
||||
|
||||
int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
|
||||
int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
|
||||
if (!mixedBlocks.init(newDataCapacity, blockLength)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
mixedBlocks.extend(newData, 0, 0, newDataLength);
|
||||
|
||||
int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
|
||||
int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
|
||||
int32_t fastLength = 0;
|
||||
for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
|
||||
|
@ -997,12 +1101,17 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
|||
blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
|
||||
inc = 1;
|
||||
fastLength = newDataLength;
|
||||
if (!mixedBlocks.init(newDataCapacity, blockLength)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
mixedBlocks.extend(newData, 0, 0, newDataLength);
|
||||
}
|
||||
if (flags[i] == ALL_SAME) {
|
||||
uint32_t value = index[i];
|
||||
int32_t n;
|
||||
// Find an earlier part of the data array of length blockLength
|
||||
// that is filled with this value.
|
||||
int32_t n = mixedBlocks.findAllSameBlock(newData, value);
|
||||
// If we find a match, and the current block is the data null block,
|
||||
// and it is not a fast block but matches the start of a fast block,
|
||||
// then we need to continue looking.
|
||||
|
@ -1010,12 +1119,10 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
|||
// and not all of the rest of the fast block is filled with this value.
|
||||
// Otherwise trie.getRange() would detect that the fast block starts at
|
||||
// dataNullOffset and assume incorrectly that it is filled with the null value.
|
||||
for (int32_t start = 0;
|
||||
(n = findAllSameBlock(newData, start, newDataLength,
|
||||
value, blockLength)) >= 0 &&
|
||||
i == dataNullIndex && i >= fastILimit && n < fastLength &&
|
||||
isStartOfSomeFastBlock(n, index, fastILimit);
|
||||
start = n + 1) {}
|
||||
while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
|
||||
isStartOfSomeFastBlock(n, index, fastILimit)) {
|
||||
n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
|
||||
}
|
||||
if (n >= 0) {
|
||||
DEBUG_DO(++countSame);
|
||||
index[i] = n;
|
||||
|
@ -1028,14 +1135,16 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
|||
}
|
||||
#endif
|
||||
index[i] = newDataLength - n;
|
||||
int32_t prevDataLength = newDataLength;
|
||||
while (n < blockLength) {
|
||||
newData[newDataLength++] = value;
|
||||
++n;
|
||||
}
|
||||
mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
|
||||
}
|
||||
} else if (flags[i] == MIXED) {
|
||||
const uint32_t *block = data + index[i];
|
||||
int32_t n = findSameBlock(newData, 0, newDataLength, block, 0, blockLength);
|
||||
int32_t n = mixedBlocks.findBlock(newData, block, 0);
|
||||
if (n >= 0) {
|
||||
DEBUG_DO(++countSame);
|
||||
index[i] = n;
|
||||
|
@ -1048,9 +1157,11 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
|||
}
|
||||
#endif
|
||||
index[i] = newDataLength - n;
|
||||
int32_t prevDataLength = newDataLength;
|
||||
while (n < blockLength) {
|
||||
newData[newDataLength++] = block[n++];
|
||||
}
|
||||
mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
|
||||
}
|
||||
} else /* SAME_AS */ {
|
||||
uint32_t j = index[i];
|
||||
|
@ -1066,7 +1177,8 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
|||
return newDataLength;
|
||||
}
|
||||
|
||||
int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &errorCode) {
|
||||
int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks,
|
||||
UErrorCode &errorCode) {
|
||||
int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
|
||||
if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
|
||||
// Only the linear fast index, no multi-stage index tables.
|
||||
|
@ -1100,6 +1212,12 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
}
|
||||
}
|
||||
|
||||
if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
|
||||
|
||||
// Examine index-3 blocks. For each determine one of:
|
||||
// - same as the index-3 null block
|
||||
// - same as a fast-index block
|
||||
|
@ -1110,6 +1228,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
// Also determine an upper limit for the index-3 table length.
|
||||
int32_t index3Capacity = 0;
|
||||
i3FirstNull = index3NullOffset;
|
||||
bool hasLongI3Blocks = false;
|
||||
// If the fast index covers the whole BMP, then
|
||||
// the multi-stage index is only for supplementary code points.
|
||||
// Otherwise, the multi-stage index covers all of Unicode.
|
||||
|
@ -1134,13 +1253,13 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
|
||||
} else {
|
||||
index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
|
||||
hasLongI3Blocks = true;
|
||||
}
|
||||
i3FirstNull = 0;
|
||||
}
|
||||
} else {
|
||||
if (oredI3 <= 0xffff) {
|
||||
int32_t n = findSameBlock(fastIndex, 0, fastIndexLength,
|
||||
index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
|
||||
int32_t n = mixedBlocks.findBlock(fastIndex, index, i);
|
||||
if (n >= 0) {
|
||||
flags[i] = I3_BMP;
|
||||
index[i] = n;
|
||||
|
@ -1151,6 +1270,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
} else {
|
||||
flags[i] = I3_18;
|
||||
index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
|
||||
hasLongI3Blocks = true;
|
||||
}
|
||||
}
|
||||
i = j;
|
||||
|
@ -1171,6 +1291,18 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
}
|
||||
uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
|
||||
|
||||
if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
MixedBlocks longI3Blocks;
|
||||
if (hasLongI3Blocks) {
|
||||
if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Compact the index-3 table and write an uncompacted version of the index-2 table.
|
||||
uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity
|
||||
int32_t i2Length = 0;
|
||||
|
@ -1190,8 +1322,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
} else if (f == I3_BMP) {
|
||||
i3 = index[i];
|
||||
} else if (f == I3_16) {
|
||||
int32_t n = findSameBlock(index16, index3Start, indexLength,
|
||||
index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
|
||||
int32_t n = mixedBlocks.findBlock(index16, index, i);
|
||||
if (n >= 0) {
|
||||
i3 = n;
|
||||
} else {
|
||||
|
@ -1203,12 +1334,18 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
|
||||
}
|
||||
i3 = indexLength - n;
|
||||
int32_t prevIndexLength = indexLength;
|
||||
while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
|
||||
index16[indexLength++] = index[i + n++];
|
||||
}
|
||||
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
if (hasLongI3Blocks) {
|
||||
longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
U_ASSERT(f == I3_18);
|
||||
U_ASSERT(hasLongI3Blocks);
|
||||
// Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
|
||||
int32_t j = i;
|
||||
int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
|
||||
|
@ -1241,8 +1378,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
index16[k++] = v;
|
||||
index16[k - 9] = upperBits;
|
||||
} while (j < jLimit);
|
||||
int32_t n = findSameBlock(index16, index3Start, indexLength,
|
||||
index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
|
||||
int32_t n = longI3Blocks.findBlock(index16, index16, indexLength);
|
||||
if (n >= 0) {
|
||||
i3 = n | 0x8000;
|
||||
} else {
|
||||
|
@ -1254,6 +1390,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
|
||||
}
|
||||
i3 = (indexLength - n) | 0x8000;
|
||||
int32_t prevIndexLength = indexLength;
|
||||
if (n > 0) {
|
||||
int32_t start = indexLength;
|
||||
while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
|
||||
|
@ -1262,6 +1399,10 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
} else {
|
||||
indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
|
||||
}
|
||||
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
if (hasLongI3Blocks) {
|
||||
longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (index3NullOffset < 0 && i3FirstNull >= 0) {
|
||||
|
@ -1284,16 +1425,23 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
}
|
||||
|
||||
// Compact the index-2 table and write the index-1 table.
|
||||
static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH,
|
||||
"must re-init mixedBlocks");
|
||||
int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
|
||||
int32_t i1 = fastIndexLength;
|
||||
for (int32_t i = 0; i < i2Length; i += blockLength) {
|
||||
if ((i2Length - i) < blockLength) {
|
||||
int32_t n;
|
||||
if ((i2Length - i) >= blockLength) {
|
||||
// normal block
|
||||
U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH);
|
||||
n = mixedBlocks.findBlock(index16, index2, i);
|
||||
} else {
|
||||
// highStart is inside the last index-2 block. Shorten it.
|
||||
blockLength = i2Length - i;
|
||||
n = findSameBlock(index16, index3Start, indexLength,
|
||||
index2, i, blockLength);
|
||||
}
|
||||
int32_t i2;
|
||||
int32_t n = findSameBlock(index16, index3Start, indexLength,
|
||||
index2, i, blockLength);
|
||||
if (n >= 0) {
|
||||
i2 = n;
|
||||
} else {
|
||||
|
@ -1304,9 +1452,11 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error
|
|||
n = getOverlap(index16, indexLength, index2, i, blockLength);
|
||||
}
|
||||
i2 = indexLength - n;
|
||||
int32_t prevIndexLength = indexLength;
|
||||
while (n < blockLength) {
|
||||
index16[indexLength++] = index2[i + n++];
|
||||
}
|
||||
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
}
|
||||
// Set the index-1 table entry.
|
||||
index16[i1++] = i2;
|
||||
|
@ -1374,7 +1524,11 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC
|
|||
uprv_memcpy(newData, asciiData, sizeof(asciiData));
|
||||
|
||||
int32_t dataNullIndex = allSameBlocks.findMostUsed();
|
||||
int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex);
|
||||
|
||||
MixedBlocks mixedBlocks;
|
||||
int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity,
|
||||
dataNullIndex, mixedBlocks, errorCode);
|
||||
if (U_FAILURE(errorCode)) { return 0; }
|
||||
U_ASSERT(newDataLength <= newDataCapacity);
|
||||
uprv_free(data);
|
||||
data = newData;
|
||||
|
@ -1399,7 +1553,7 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC
|
|||
dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
|
||||
}
|
||||
|
||||
int32_t indexLength = compactIndex(fastILimit, errorCode);
|
||||
int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode);
|
||||
highStart = realHighStart;
|
||||
return indexLength;
|
||||
}
|
||||
|
|
|
@ -526,34 +526,6 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
}
|
||||
|
||||
/** Search for an identical block. */
|
||||
private static int findSameBlock(int[] p, int pStart, int length,
|
||||
int[] q, int qStart, int blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
length -= blockLength;
|
||||
|
||||
while (pStart <= length) {
|
||||
if (equalBlocks(p, pStart, q, qStart, blockLength)) {
|
||||
return pStart;
|
||||
}
|
||||
++pStart;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private static int findSameBlock(char[] p, int pStart, int length,
|
||||
int[] q, int qStart, int blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
length -= blockLength;
|
||||
|
||||
while (pStart <= length) {
|
||||
if (equalBlocks(p, pStart, q, qStart, blockLength)) {
|
||||
return pStart;
|
||||
}
|
||||
++pStart;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private static int findSameBlock(char[] p, int pStart, int length,
|
||||
char[] q, int qStart, int blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
|
@ -738,6 +710,208 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
private int[] refCounts = new int[CAPACITY];
|
||||
}
|
||||
|
||||
// Custom hash table for mixed-value blocks to be found anywhere in the
|
||||
// compacted data or index so far.
|
||||
private static final class MixedBlocks {
|
||||
void init(int maxLength, int newBlockLength) {
|
||||
// We store actual data indexes + 1 to reserve 0 for empty entries.
|
||||
int maxDataIndex = maxLength - newBlockLength + 1;
|
||||
int newLength;
|
||||
if (maxDataIndex <= 0xfff) { // 4k
|
||||
newLength = 6007;
|
||||
shift = 12;
|
||||
mask = 0xfff;
|
||||
} else if (maxDataIndex <= 0x7fff) { // 32k
|
||||
newLength = 50021;
|
||||
shift = 15;
|
||||
mask = 0x7fff;
|
||||
} else if (maxDataIndex <= 0x1ffff) { // 128k
|
||||
newLength = 200003;
|
||||
shift = 17;
|
||||
mask = 0x1ffff;
|
||||
} else {
|
||||
// maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M
|
||||
newLength = 1500007;
|
||||
shift = 21;
|
||||
mask = 0x1fffff;
|
||||
}
|
||||
if (table == null || newLength > table.length) {
|
||||
table = new int[newLength];
|
||||
} else {
|
||||
Arrays.fill(table, 0, newLength, 0);
|
||||
}
|
||||
length = newLength;
|
||||
|
||||
blockLength = newBlockLength;
|
||||
}
|
||||
|
||||
void extend(int[] data, int minStart, int prevDataLength, int newDataLength) {
|
||||
int start = prevDataLength - blockLength;
|
||||
if (start >= minStart) {
|
||||
++start; // Skip the last block that we added last time.
|
||||
} else {
|
||||
start = minStart; // Begin with the first full block.
|
||||
}
|
||||
for (int end = newDataLength - blockLength; start <= end; ++start) {
|
||||
int hashCode = makeHashCode(data, start);
|
||||
addEntry(data, null, start, hashCode, start);
|
||||
}
|
||||
}
|
||||
|
||||
void extend(char[] data, int minStart, int prevDataLength, int newDataLength) {
|
||||
int start = prevDataLength - blockLength;
|
||||
if (start >= minStart) {
|
||||
++start; // Skip the last block that we added last time.
|
||||
} else {
|
||||
start = minStart; // Begin with the first full block.
|
||||
}
|
||||
for (int end = newDataLength - blockLength; start <= end; ++start) {
|
||||
int hashCode = makeHashCode(data, start);
|
||||
addEntry(null, data, start, hashCode, start);
|
||||
}
|
||||
}
|
||||
|
||||
int findBlock(int[] data, int[] blockData, int blockStart) {
|
||||
int hashCode = makeHashCode(blockData, blockStart);
|
||||
int entryIndex = findEntry(data, null, blockData, null, blockStart, hashCode);
|
||||
if (entryIndex >= 0) {
|
||||
return (table[entryIndex] & mask) - 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int findBlock(char[] data, int[] blockData, int blockStart) {
|
||||
int hashCode = makeHashCode(blockData, blockStart);
|
||||
int entryIndex = findEntry(null, data, blockData, null, blockStart, hashCode);
|
||||
if (entryIndex >= 0) {
|
||||
return (table[entryIndex] & mask) - 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int findBlock(char[] data, char[] blockData, int blockStart) {
|
||||
int hashCode = makeHashCode(blockData, blockStart);
|
||||
int entryIndex = findEntry(null, data, null, blockData, blockStart, hashCode);
|
||||
if (entryIndex >= 0) {
|
||||
return (table[entryIndex] & mask) - 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
int findAllSameBlock(int[] data, int blockValue) {
|
||||
int hashCode = makeHashCode(blockValue);
|
||||
int entryIndex = findEntry(data, blockValue, hashCode);
|
||||
if (entryIndex >= 0) {
|
||||
return (table[entryIndex] & mask) - 1;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private int makeHashCode(int[] blockData, int blockStart) {
|
||||
int blockLimit = blockStart + blockLength;
|
||||
int hashCode = blockData[blockStart++];
|
||||
do {
|
||||
hashCode = 37 * hashCode + blockData[blockStart++];
|
||||
} while (blockStart < blockLimit);
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
private int makeHashCode(char[] blockData, int blockStart) {
|
||||
int blockLimit = blockStart + blockLength;
|
||||
int hashCode = blockData[blockStart++];
|
||||
do {
|
||||
hashCode = 37 * hashCode + blockData[blockStart++];
|
||||
} while (blockStart < blockLimit);
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
private int makeHashCode(int blockValue) {
|
||||
int hashCode = blockValue;
|
||||
for (int i = 1; i < blockLength; ++i) {
|
||||
hashCode = 37 * hashCode + blockValue;
|
||||
}
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
private void addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex) {
|
||||
assert(0 <= dataIndex && dataIndex < mask);
|
||||
int entryIndex = findEntry(data32, data16, data32, data16, blockStart, hashCode);
|
||||
if (entryIndex < 0) {
|
||||
table[~entryIndex] = (hashCode << shift) | (dataIndex + 1);
|
||||
}
|
||||
}
|
||||
|
||||
private int findEntry(int[] data32, char[] data16,
|
||||
int[] blockData32, char[] blockData16, int blockStart, int hashCode) {
|
||||
int shiftedHashCode = hashCode << shift;
|
||||
int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1
|
||||
for (int entryIndex = initialEntryIndex;;) {
|
||||
int entry = table[entryIndex];
|
||||
if (entry == 0) {
|
||||
return ~entryIndex;
|
||||
}
|
||||
if ((entry & ~mask) == shiftedHashCode) {
|
||||
int dataIndex = (entry & mask) - 1;
|
||||
if (data32 != null ?
|
||||
equalBlocks(data32, dataIndex, blockData32, blockStart, blockLength) :
|
||||
blockData32 != null ?
|
||||
equalBlocks(data16, dataIndex, blockData32, blockStart, blockLength) :
|
||||
equalBlocks(data16, dataIndex, blockData16, blockStart, blockLength)) {
|
||||
return entryIndex;
|
||||
}
|
||||
}
|
||||
entryIndex = nextIndex(initialEntryIndex, entryIndex);
|
||||
}
|
||||
}
|
||||
|
||||
private int findEntry(int[] data, int blockValue, int hashCode) {
|
||||
int shiftedHashCode = hashCode << shift;
|
||||
int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1
|
||||
for (int entryIndex = initialEntryIndex;;) {
|
||||
int entry = table[entryIndex];
|
||||
if (entry == 0) {
|
||||
return ~entryIndex;
|
||||
}
|
||||
if ((entry & ~mask) == shiftedHashCode) {
|
||||
int dataIndex = (entry & mask) - 1;
|
||||
if (allValuesSameAs(data, dataIndex, blockLength, blockValue)) {
|
||||
return entryIndex;
|
||||
}
|
||||
}
|
||||
entryIndex = nextIndex(initialEntryIndex, entryIndex);
|
||||
}
|
||||
}
|
||||
|
||||
private int nextIndex(int initialEntryIndex, int entryIndex) {
|
||||
// U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length);
|
||||
return (entryIndex + initialEntryIndex) % length;
|
||||
}
|
||||
|
||||
/** Ensures non-negative n % m (that is 0..m-1). */
|
||||
private int modulo(int n, int m) {
|
||||
int i = n % m;
|
||||
if (i < 0) {
|
||||
i += m;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
// Hash table.
|
||||
// The length is a prime number, larger than the maximum data length.
|
||||
// The "shift" lower bits store a data index + 1.
|
||||
// The remaining upper bits store a partial hashCode of the block data values.
|
||||
private int[] table;
|
||||
private int length;
|
||||
private int shift;
|
||||
private int mask;
|
||||
|
||||
private int blockLength;
|
||||
}
|
||||
|
||||
private int compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks) {
|
||||
// ASCII data will be stored as a linear table, even if the following code
|
||||
// does not yet count it that way.
|
||||
|
@ -836,7 +1010,8 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
*
|
||||
* It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
|
||||
*/
|
||||
private int compactData(int fastILimit, int[] newData, int dataNullIndex) {
|
||||
private int compactData(
|
||||
int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks) {
|
||||
// The linear ASCII data has been copied into newData already.
|
||||
int newDataLength = 0;
|
||||
for (int i = 0; newDataLength < ASCII_LIMIT;
|
||||
|
@ -844,8 +1019,11 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
index[i] = newDataLength;
|
||||
}
|
||||
|
||||
int iLimit = highStart >> CodePointTrie.SHIFT_3;
|
||||
int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH;
|
||||
mixedBlocks.init(newData.length, blockLength);
|
||||
mixedBlocks.extend(newData, 0, 0, newDataLength);
|
||||
|
||||
int iLimit = highStart >> CodePointTrie.SHIFT_3;
|
||||
int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
|
||||
int fastLength = 0;
|
||||
for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) {
|
||||
|
@ -853,11 +1031,14 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
|
||||
inc = 1;
|
||||
fastLength = newDataLength;
|
||||
mixedBlocks.init(newData.length, blockLength);
|
||||
mixedBlocks.extend(newData, 0, 0, newDataLength);
|
||||
}
|
||||
if (flags[i] == ALL_SAME) {
|
||||
int value = index[i];
|
||||
// Find an earlier part of the data array of length blockLength
|
||||
// that is filled with this value.
|
||||
int n = mixedBlocks.findAllSameBlock(newData, value);
|
||||
// If we find a match, and the current block is the data null block,
|
||||
// and it is not a fast block but matches the start of a fast block,
|
||||
// then we need to continue looking.
|
||||
|
@ -865,34 +1046,35 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
// and not all of the rest of the fast block is filled with this value.
|
||||
// Otherwise trie.getRange() would detect that the fast block starts at
|
||||
// dataNullOffset and assume incorrectly that it is filled with the null value.
|
||||
int n;
|
||||
for (int start = 0;
|
||||
(n = findAllSameBlock(newData, start, newDataLength,
|
||||
value, blockLength)) >= 0 &&
|
||||
i == dataNullIndex && i >= fastILimit && n < fastLength &&
|
||||
isStartOfSomeFastBlock(n, index, fastILimit);
|
||||
start = n + 1) {}
|
||||
while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength &&
|
||||
isStartOfSomeFastBlock(n, index, fastILimit)) {
|
||||
n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength);
|
||||
}
|
||||
if (n >= 0) {
|
||||
index[i] = n;
|
||||
} else {
|
||||
n = getAllSameOverlap(newData, newDataLength, value, blockLength);
|
||||
index[i] = newDataLength - n;
|
||||
int prevDataLength = newDataLength;
|
||||
while (n < blockLength) {
|
||||
newData[newDataLength++] = value;
|
||||
++n;
|
||||
}
|
||||
mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
|
||||
}
|
||||
} else if (flags[i] == MIXED) {
|
||||
int block = index[i];
|
||||
int n = findSameBlock(newData, 0, newDataLength, data, block, blockLength);
|
||||
int n = mixedBlocks.findBlock(newData, data, block);
|
||||
if (n >= 0) {
|
||||
index[i] = n;
|
||||
} else {
|
||||
n = getOverlap(newData, newDataLength, data, block, blockLength);
|
||||
index[i] = newDataLength - n;
|
||||
int prevDataLength = newDataLength;
|
||||
while (n < blockLength) {
|
||||
newData[newDataLength++] = data[block + n++];
|
||||
}
|
||||
mixedBlocks.extend(newData, 0, prevDataLength, newDataLength);
|
||||
}
|
||||
} else /* SAME_AS */ {
|
||||
int j = index[i];
|
||||
|
@ -903,7 +1085,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
return newDataLength;
|
||||
}
|
||||
|
||||
private int compactIndex(int fastILimit) {
|
||||
private int compactIndex(int fastILimit, MixedBlocks mixedBlocks) {
|
||||
int fastIndexLength = fastILimit >> (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3);
|
||||
if ((highStart >> CodePointTrie.FAST_SHIFT) <= fastIndexLength) {
|
||||
// Only the linear fast index, no multi-stage index tables.
|
||||
|
@ -937,6 +1119,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
}
|
||||
}
|
||||
|
||||
mixedBlocks.init(fastIndexLength, CodePointTrie.INDEX_3_BLOCK_LENGTH);
|
||||
mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength);
|
||||
|
||||
// Examine index-3 blocks. For each determine one of:
|
||||
// - same as the index-3 null block
|
||||
// - same as a fast-index block
|
||||
|
@ -947,6 +1132,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
// Also determine an upper limit for the index-3 table length.
|
||||
int index3Capacity = 0;
|
||||
i3FirstNull = index3NullOffset;
|
||||
boolean hasLongI3Blocks = false;
|
||||
// If the fast index covers the whole BMP, then
|
||||
// the multi-stage index is only for supplementary code points.
|
||||
// Otherwise, the multi-stage index covers all of Unicode.
|
||||
|
@ -971,13 +1157,13 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH;
|
||||
} else {
|
||||
index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
|
||||
hasLongI3Blocks = true;
|
||||
}
|
||||
i3FirstNull = 0;
|
||||
}
|
||||
} else {
|
||||
if (oredI3 <= 0xffff) {
|
||||
int n = findSameBlock(fastIndex, 0, fastIndexLength,
|
||||
index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH);
|
||||
int n = mixedBlocks.findBlock(fastIndex, index, i);
|
||||
if (n >= 0) {
|
||||
flags[i] = I3_BMP;
|
||||
index[i] = n;
|
||||
|
@ -988,6 +1174,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
} else {
|
||||
flags[i] = I3_18;
|
||||
index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
|
||||
hasLongI3Blocks = true;
|
||||
}
|
||||
}
|
||||
i = j;
|
||||
|
@ -1003,6 +1190,13 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
int index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
|
||||
index16 = Arrays.copyOf(fastIndex, index16Capacity);
|
||||
|
||||
mixedBlocks.init(index16Capacity, CodePointTrie.INDEX_3_BLOCK_LENGTH);
|
||||
MixedBlocks longI3Blocks = null;
|
||||
if (hasLongI3Blocks) {
|
||||
longI3Blocks = new MixedBlocks();
|
||||
longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH);
|
||||
}
|
||||
|
||||
// Compact the index-3 table and write an uncompacted version of the index-2 table.
|
||||
char[] index2 = new char[index2Capacity];
|
||||
int i2Length = 0;
|
||||
|
@ -1022,8 +1216,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
} else if (f == I3_BMP) {
|
||||
i3 = index[i];
|
||||
} else if (f == I3_16) {
|
||||
int n = findSameBlock(index16, index3Start, indexLength,
|
||||
index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH);
|
||||
int n = mixedBlocks.findBlock(index16, index, i);
|
||||
if (n >= 0) {
|
||||
i3 = n;
|
||||
} else {
|
||||
|
@ -1035,12 +1228,18 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH);
|
||||
}
|
||||
i3 = indexLength - n;
|
||||
int prevIndexLength = indexLength;
|
||||
while (n < CodePointTrie.INDEX_3_BLOCK_LENGTH) {
|
||||
index16[indexLength++] = (char)index[i + n++];
|
||||
}
|
||||
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
if (hasLongI3Blocks) {
|
||||
longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
assert(f == I3_18);
|
||||
assert(hasLongI3Blocks);
|
||||
// Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
|
||||
int j = i;
|
||||
int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH;
|
||||
|
@ -1073,8 +1272,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
index16[k++] = (char)v;
|
||||
index16[k - 9] = (char)upperBits;
|
||||
} while (j < jLimit);
|
||||
int n = findSameBlock(index16, index3Start, indexLength,
|
||||
index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
|
||||
int n = longI3Blocks.findBlock(index16, index16, indexLength);
|
||||
if (n >= 0) {
|
||||
i3 = n | 0x8000;
|
||||
} else {
|
||||
|
@ -1086,6 +1284,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
|
||||
}
|
||||
i3 = (indexLength - n) | 0x8000;
|
||||
int prevIndexLength = indexLength;
|
||||
if (n > 0) {
|
||||
int start = indexLength;
|
||||
while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
|
||||
|
@ -1094,6 +1293,10 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
} else {
|
||||
indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
|
||||
}
|
||||
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
if (hasLongI3Blocks) {
|
||||
longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (index3NullOffset < 0 && i3FirstNull >= 0) {
|
||||
|
@ -1116,16 +1319,23 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
}
|
||||
|
||||
// Compact the index-2 table and write the index-1 table.
|
||||
// assert(CodePointTrie.INDEX_2_BLOCK_LENGTH == CodePointTrie.INDEX_3_BLOCK_LENGTH) :
|
||||
// "must re-init mixedBlocks";
|
||||
int blockLength = CodePointTrie.INDEX_2_BLOCK_LENGTH;
|
||||
int i1 = fastIndexLength;
|
||||
for (int i = 0; i < i2Length; i += blockLength) {
|
||||
if ((i2Length - i) < blockLength) {
|
||||
int n;
|
||||
if ((i2Length - i) >= blockLength) {
|
||||
// normal block
|
||||
assert(blockLength == CodePointTrie.INDEX_2_BLOCK_LENGTH);
|
||||
n = mixedBlocks.findBlock(index16, index2, i);
|
||||
} else {
|
||||
// highStart is inside the last index-2 block. Shorten it.
|
||||
blockLength = i2Length - i;
|
||||
n = findSameBlock(index16, index3Start, indexLength,
|
||||
index2, i, blockLength);
|
||||
}
|
||||
int i2;
|
||||
int n = findSameBlock(index16, index3Start, indexLength,
|
||||
index2, i, blockLength);
|
||||
if (n >= 0) {
|
||||
i2 = n;
|
||||
} else {
|
||||
|
@ -1136,9 +1346,11 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
n = getOverlap(index16, indexLength, index2, i, blockLength);
|
||||
}
|
||||
i2 = indexLength - n;
|
||||
int prevIndexLength = indexLength;
|
||||
while (n < blockLength) {
|
||||
index16[indexLength++] = index2[i + n++];
|
||||
}
|
||||
mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength);
|
||||
}
|
||||
// Set the index-1 table entry.
|
||||
index16[i1++] = (char)i2;
|
||||
|
@ -1186,7 +1398,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
int[] newData = Arrays.copyOf(asciiData, newDataCapacity);
|
||||
|
||||
int dataNullIndex = allSameBlocks.findMostUsed();
|
||||
int newDataLength = compactData(fastILimit, newData, dataNullIndex);
|
||||
|
||||
MixedBlocks mixedBlocks = new MixedBlocks();
|
||||
int newDataLength = compactData(fastILimit, newData, dataNullIndex, mixedBlocks);
|
||||
assert(newDataLength <= newDataCapacity);
|
||||
data = newData;
|
||||
dataLength = newDataLength;
|
||||
|
@ -1203,7 +1417,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
dataNullOffset = CodePointTrie.NO_DATA_NULL_OFFSET;
|
||||
}
|
||||
|
||||
int indexLength = compactIndex(fastILimit);
|
||||
int indexLength = compactIndex(fastILimit, mixedBlocks);
|
||||
highStart = realHighStart;
|
||||
return indexLength;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue