diff --git a/icu4c/source/common/umutablecptrie.cpp b/icu4c/source/common/umutablecptrie.cpp index 2e4e4722d92..b649d1e7b64 100644 --- a/icu4c/source/common/umutablecptrie.cpp +++ b/icu4c/source/common/umutablecptrie.cpp @@ -65,6 +65,7 @@ constexpr uint8_t I3_18 = 3; constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8; class AllSameBlocks; +class MixedBlocks; class MutableCodePointTrie : public UMemory { public: @@ -97,8 +98,10 @@ private: void maskValues(uint32_t mask); UChar32 findHighStart() const; int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks); - int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex); - int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode); + int32_t compactData( + int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, + int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode); + int32_t compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, UErrorCode &errorCode); int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode); uint32_t *index = nullptr; @@ -553,28 +556,8 @@ void MutableCodePointTrie::maskValues(uint32_t mask) { } } -inline bool -equalBlocks(const uint32_t *s, const uint32_t *t, int32_t length) { - while (length > 0 && *s == *t) { - ++s; - ++t; - --length; - } - return length == 0; -} - -inline bool -equalBlocks(const uint16_t *s, const uint32_t *t, int32_t length) { - while (length > 0 && *s == *t) { - ++s; - ++t; - --length; - } - return length == 0; -} - -inline bool -equalBlocks(const uint16_t *s, const uint16_t *t, int32_t length) { +template +bool equalBlocks(const UIntA *s, const UIntB *t, int32_t length) { while (length > 0 && *s == *t) { ++s; ++t; @@ -590,36 +573,6 @@ bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) { } /** Search for an identical block. */ -int32_t findSameBlock(const uint32_t *p, int32_t pStart, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - q += qStart; - while (pStart <= length) { - if (equalBlocks(p + pStart, q, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; -} - -int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - q += qStart; - while (pStart <= length) { - if (equalBlocks(p + pStart, q, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; -} - int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length, const uint16_t *q, int32_t qStart, int32_t blockLength) { // Ensure that we do not even partially get past length. @@ -660,30 +613,9 @@ int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit, * Look for maximum overlap of the beginning of the other block * with the previous, adjacent block. */ -int32_t getOverlap(const uint32_t *p, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - int32_t overlap = blockLength - 1; - U_ASSERT(overlap <= length); - q += qStart; - while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { - --overlap; - } - return overlap; -} - -int32_t getOverlap(const uint16_t *p, int32_t length, - const uint32_t *q, int32_t qStart, int32_t blockLength) { - int32_t overlap = blockLength - 1; - U_ASSERT(overlap <= length); - q += qStart; - while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) { - --overlap; - } - return overlap; -} - -int32_t getOverlap(const uint16_t *p, int32_t length, - const uint16_t *q, int32_t qStart, int32_t blockLength) { +template +int32_t getOverlap(const UIntA *p, int32_t length, + const UIntB *q, int32_t qStart, int32_t blockLength) { int32_t overlap = blockLength - 1; U_ASSERT(overlap <= length); q += qStart; @@ -812,6 +744,171 @@ private: int32_t refCounts[CAPACITY]; }; +// Custom hash table for mixed-value blocks to be found anywhere in the +// compacted data or index so far. +class MixedBlocks { +public: + MixedBlocks() {} + ~MixedBlocks() { + uprv_free(table); + } + + bool init(int32_t maxLength, int32_t newBlockLength) { + // We store actual data indexes + 1 to reserve 0 for empty entries. + int32_t maxDataIndex = maxLength - newBlockLength + 1; + int32_t newLength; + if (maxDataIndex <= 0xfff) { // 4k + newLength = 6007; + shift = 12; + mask = 0xfff; + } else if (maxDataIndex <= 0x7fff) { // 32k + newLength = 50021; + shift = 15; + mask = 0x7fff; + } else if (maxDataIndex <= 0x1ffff) { // 128k + newLength = 200003; + shift = 17; + mask = 0x1ffff; + } else { + // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M + newLength = 1500007; + shift = 21; + mask = 0x1fffff; + } + if (newLength > capacity) { + uprv_free(table); + table = (uint32_t *)uprv_malloc(newLength * 4); + if (table == nullptr) { + return false; + } + capacity = newLength; + } + length = newLength; + uprv_memset(table, 0, length * 4); + + blockLength = newBlockLength; + return true; + } + + template + void extend(const UInt *data, int32_t minStart, int32_t prevDataLength, int32_t newDataLength) { + int32_t start = prevDataLength - blockLength; + if (start >= minStart) { + ++start; // Skip the last block that we added last time. + } else { + start = minStart; // Begin with the first full block. + } + for (int32_t end = newDataLength - blockLength; start <= end; ++start) { + uint32_t hashCode = makeHashCode(data, start); + addEntry(data, start, hashCode, start); + } + } + + template + int32_t findBlock(const UIntA *data, const UIntB *blockData, int32_t blockStart) const { + uint32_t hashCode = makeHashCode(blockData, blockStart); + int32_t entryIndex = findEntry(data, blockData, blockStart, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + int32_t findAllSameBlock(const uint32_t *data, uint32_t blockValue) const { + uint32_t hashCode = makeHashCode(blockValue); + int32_t entryIndex = findEntry(data, blockValue, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + +private: + template + uint32_t makeHashCode(const UInt *blockData, int32_t blockStart) const { + int32_t blockLimit = blockStart + blockLength; + uint32_t hashCode = blockData[blockStart++]; + do { + hashCode = 37 * hashCode + blockData[blockStart++]; + } while (blockStart < blockLimit); + return hashCode; + } + + uint32_t makeHashCode(uint32_t blockValue) const { + uint32_t hashCode = blockValue; + for (int32_t i = 1; i < blockLength; ++i) { + hashCode = 37 * hashCode + blockValue; + } + return hashCode; + } + + template + void addEntry(const UInt *data, int32_t blockStart, uint32_t hashCode, int32_t dataIndex) { + U_ASSERT(0 <= dataIndex && dataIndex < (int32_t)mask); + int32_t entryIndex = findEntry(data, data, blockStart, hashCode); + if (entryIndex < 0) { + table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); + } + } + + template + int32_t findEntry(const UIntA *data, const UIntB *blockData, int32_t blockStart, + uint32_t hashCode) const { + uint32_t shiftedHashCode = hashCode << shift; + int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 + for (int32_t entryIndex = initialEntryIndex;;) { + uint32_t entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int32_t dataIndex = (entry & mask) - 1; + if (equalBlocks(data + dataIndex, blockData + blockStart, blockLength)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + int32_t findEntry(const uint32_t *data, uint32_t blockValue, uint32_t hashCode) const { + uint32_t shiftedHashCode = hashCode << shift; + int32_t initialEntryIndex = (hashCode % (length - 1)) + 1; // 1..length-1 + for (int32_t entryIndex = initialEntryIndex;;) { + uint32_t entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int32_t dataIndex = (entry & mask) - 1; + if (allValuesSameAs(data + dataIndex, blockLength, blockValue)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + inline int32_t nextIndex(int32_t initialEntryIndex, int32_t entryIndex) const { + // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); + return (entryIndex + initialEntryIndex) % length; + } + + // Hash table. + // The length is a prime number, larger than the maximum data length. + // The "shift" lower bits store a data index + 1. + // The remaining upper bits store a partial hashCode of the block data values. + uint32_t *table = nullptr; + int32_t capacity = 0; + int32_t length = 0; + int32_t shift = 0; + uint32_t mask = 0; + + int32_t blockLength = 0; +}; + int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) { #ifdef UCPTRIE_DEBUG bool overflow = false; @@ -967,8 +1064,9 @@ void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value, * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ -int32_t MutableCodePointTrie::compactData(int32_t fastILimit, - uint32_t *newData, int32_t dataNullIndex) { +int32_t MutableCodePointTrie::compactData( + int32_t fastILimit, uint32_t *newData, int32_t newDataCapacity, + int32_t dataNullIndex, MixedBlocks &mixedBlocks, UErrorCode &errorCode) { #ifdef UCPTRIE_DEBUG int32_t countSame=0, sumOverlaps=0; bool printData = dataLength == 29088 /* line.brk */ || @@ -988,8 +1086,14 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, #endif } - int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH; + if (!mixedBlocks.init(newDataCapacity, blockLength)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(newData, 0, 0, newDataLength); + + int32_t iLimit = highStart >> UCPTRIE_SHIFT_3; int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; int32_t fastLength = 0; for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) { @@ -997,12 +1101,17 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH; inc = 1; fastLength = newDataLength; + if (!mixedBlocks.init(newDataCapacity, blockLength)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(newData, 0, 0, newDataLength); } if (flags[i] == ALL_SAME) { uint32_t value = index[i]; - int32_t n; // Find an earlier part of the data array of length blockLength // that is filled with this value. + int32_t n = mixedBlocks.findAllSameBlock(newData, value); // If we find a match, and the current block is the data null block, // and it is not a fast block but matches the start of a fast block, // then we need to continue looking. @@ -1010,12 +1119,10 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, // and not all of the rest of the fast block is filled with this value. // Otherwise trie.getRange() would detect that the fast block starts at // dataNullOffset and assume incorrectly that it is filled with the null value. - for (int32_t start = 0; - (n = findAllSameBlock(newData, start, newDataLength, - value, blockLength)) >= 0 && - i == dataNullIndex && i >= fastILimit && n < fastLength && - isStartOfSomeFastBlock(n, index, fastILimit); - start = n + 1) {} + while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit)) { + n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); + } if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; @@ -1028,14 +1135,16 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, } #endif index[i] = newDataLength - n; + int32_t prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = value; ++n; } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else if (flags[i] == MIXED) { const uint32_t *block = data + index[i]; - int32_t n = findSameBlock(newData, 0, newDataLength, block, 0, blockLength); + int32_t n = mixedBlocks.findBlock(newData, block, 0); if (n >= 0) { DEBUG_DO(++countSame); index[i] = n; @@ -1048,9 +1157,11 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, } #endif index[i] = newDataLength - n; + int32_t prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = block[n++]; } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else /* SAME_AS */ { uint32_t j = index[i]; @@ -1066,7 +1177,8 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, return newDataLength; } -int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &errorCode) { +int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, MixedBlocks &mixedBlocks, + UErrorCode &errorCode) { int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3); if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) { // Only the linear fast index, no multi-stage index tables. @@ -1100,6 +1212,12 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error } } + if (!mixedBlocks.init(fastIndexLength, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); + // Examine index-3 blocks. For each determine one of: // - same as the index-3 null block // - same as a fast-index block @@ -1110,6 +1228,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error // Also determine an upper limit for the index-3 table length. int32_t index3Capacity = 0; i3FirstNull = index3NullOffset; + bool hasLongI3Blocks = false; // If the fast index covers the whole BMP, then // the multi-stage index is only for supplementary code points. // Otherwise, the multi-stage index covers all of Unicode. @@ -1134,13 +1253,13 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH; } else { index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; } i3FirstNull = 0; } } else { if (oredI3 <= 0xffff) { - int32_t n = findSameBlock(fastIndex, 0, fastIndexLength, - index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); + int32_t n = mixedBlocks.findBlock(fastIndex, index, i); if (n >= 0) { flags[i] = I3_BMP; index[i] = n; @@ -1151,6 +1270,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error } else { flags[i] = I3_18; index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; } } i = j; @@ -1171,6 +1291,18 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error } uprv_memcpy(index16, fastIndex, fastIndexLength * 2); + if (!mixedBlocks.init(index16Capacity, UCPTRIE_INDEX_3_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + MixedBlocks longI3Blocks; + if (hasLongI3Blocks) { + if (!longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH)) { + errorCode = U_MEMORY_ALLOCATION_ERROR; + return 0; + } + } + // Compact the index-3 table and write an uncompacted version of the index-2 table. uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity int32_t i2Length = 0; @@ -1190,8 +1322,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error } else if (f == I3_BMP) { i3 = index[i]; } else if (f == I3_16) { - int32_t n = findSameBlock(index16, index3Start, indexLength, - index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); + int32_t n = mixedBlocks.findBlock(index16, index, i); if (n >= 0) { i3 = n; } else { @@ -1203,12 +1334,18 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH); } i3 = indexLength - n; + int32_t prevIndexLength = indexLength; while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) { index16[indexLength++] = index[i + n++]; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } } } else { U_ASSERT(f == I3_18); + U_ASSERT(hasLongI3Blocks); // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. int32_t j = i; int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH; @@ -1241,8 +1378,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error index16[k++] = v; index16[k - 9] = upperBits; } while (j < jLimit); - int32_t n = findSameBlock(index16, index3Start, indexLength, - index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); + int32_t n = longI3Blocks.findBlock(index16, index16, indexLength); if (n >= 0) { i3 = n | 0x8000; } else { @@ -1254,6 +1390,7 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); } i3 = (indexLength - n) | 0x8000; + int32_t prevIndexLength = indexLength; if (n > 0) { int32_t start = indexLength; while (n < INDEX_3_18BIT_BLOCK_LENGTH) { @@ -1262,6 +1399,10 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error } else { indexLength += INDEX_3_18BIT_BLOCK_LENGTH; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } } } if (index3NullOffset < 0 && i3FirstNull >= 0) { @@ -1284,16 +1425,23 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error } // Compact the index-2 table and write the index-1 table. + static_assert(UCPTRIE_INDEX_2_BLOCK_LENGTH == UCPTRIE_INDEX_3_BLOCK_LENGTH, + "must re-init mixedBlocks"); int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH; int32_t i1 = fastIndexLength; for (int32_t i = 0; i < i2Length; i += blockLength) { - if ((i2Length - i) < blockLength) { + int32_t n; + if ((i2Length - i) >= blockLength) { + // normal block + U_ASSERT(blockLength == UCPTRIE_INDEX_2_BLOCK_LENGTH); + n = mixedBlocks.findBlock(index16, index2, i); + } else { // highStart is inside the last index-2 block. Shorten it. blockLength = i2Length - i; + n = findSameBlock(index16, index3Start, indexLength, + index2, i, blockLength); } int32_t i2; - int32_t n = findSameBlock(index16, index3Start, indexLength, - index2, i, blockLength); if (n >= 0) { i2 = n; } else { @@ -1304,9 +1452,11 @@ int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &error n = getOverlap(index16, indexLength, index2, i, blockLength); } i2 = indexLength - n; + int32_t prevIndexLength = indexLength; while (n < blockLength) { index16[indexLength++] = index2[i + n++]; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); } // Set the index-1 table entry. index16[i1++] = i2; @@ -1374,7 +1524,11 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC uprv_memcpy(newData, asciiData, sizeof(asciiData)); int32_t dataNullIndex = allSameBlocks.findMostUsed(); - int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex); + + MixedBlocks mixedBlocks; + int32_t newDataLength = compactData(fastILimit, newData, newDataCapacity, + dataNullIndex, mixedBlocks, errorCode); + if (U_FAILURE(errorCode)) { return 0; } U_ASSERT(newDataLength <= newDataCapacity); uprv_free(data); data = newData; @@ -1399,7 +1553,7 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET; } - int32_t indexLength = compactIndex(fastILimit, errorCode); + int32_t indexLength = compactIndex(fastILimit, mixedBlocks, errorCode); highStart = realHighStart; return indexLength; } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java b/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java index de9e6bc6dd5..7892c9178e0 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/MutableCodePointTrie.java @@ -526,34 +526,6 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl } /** Search for an identical block. */ - private static int findSameBlock(int[] p, int pStart, int length, - int[] q, int qStart, int blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - while (pStart <= length) { - if (equalBlocks(p, pStart, q, qStart, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; - } - - private static int findSameBlock(char[] p, int pStart, int length, - int[] q, int qStart, int blockLength) { - // Ensure that we do not even partially get past length. - length -= blockLength; - - while (pStart <= length) { - if (equalBlocks(p, pStart, q, qStart, blockLength)) { - return pStart; - } - ++pStart; - } - return -1; - } - private static int findSameBlock(char[] p, int pStart, int length, char[] q, int qStart, int blockLength) { // Ensure that we do not even partially get past length. @@ -738,6 +710,208 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl private int[] refCounts = new int[CAPACITY]; } + // Custom hash table for mixed-value blocks to be found anywhere in the + // compacted data or index so far. + private static final class MixedBlocks { + void init(int maxLength, int newBlockLength) { + // We store actual data indexes + 1 to reserve 0 for empty entries. + int maxDataIndex = maxLength - newBlockLength + 1; + int newLength; + if (maxDataIndex <= 0xfff) { // 4k + newLength = 6007; + shift = 12; + mask = 0xfff; + } else if (maxDataIndex <= 0x7fff) { // 32k + newLength = 50021; + shift = 15; + mask = 0x7fff; + } else if (maxDataIndex <= 0x1ffff) { // 128k + newLength = 200003; + shift = 17; + mask = 0x1ffff; + } else { + // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M + newLength = 1500007; + shift = 21; + mask = 0x1fffff; + } + if (table == null || newLength > table.length) { + table = new int[newLength]; + } else { + Arrays.fill(table, 0, newLength, 0); + } + length = newLength; + + blockLength = newBlockLength; + } + + void extend(int[] data, int minStart, int prevDataLength, int newDataLength) { + int start = prevDataLength - blockLength; + if (start >= minStart) { + ++start; // Skip the last block that we added last time. + } else { + start = minStart; // Begin with the first full block. + } + for (int end = newDataLength - blockLength; start <= end; ++start) { + int hashCode = makeHashCode(data, start); + addEntry(data, null, start, hashCode, start); + } + } + + void extend(char[] data, int minStart, int prevDataLength, int newDataLength) { + int start = prevDataLength - blockLength; + if (start >= minStart) { + ++start; // Skip the last block that we added last time. + } else { + start = minStart; // Begin with the first full block. + } + for (int end = newDataLength - blockLength; start <= end; ++start) { + int hashCode = makeHashCode(data, start); + addEntry(null, data, start, hashCode, start); + } + } + + int findBlock(int[] data, int[] blockData, int blockStart) { + int hashCode = makeHashCode(blockData, blockStart); + int entryIndex = findEntry(data, null, blockData, null, blockStart, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + int findBlock(char[] data, int[] blockData, int blockStart) { + int hashCode = makeHashCode(blockData, blockStart); + int entryIndex = findEntry(null, data, blockData, null, blockStart, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + int findBlock(char[] data, char[] blockData, int blockStart) { + int hashCode = makeHashCode(blockData, blockStart); + int entryIndex = findEntry(null, data, null, blockData, blockStart, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + int findAllSameBlock(int[] data, int blockValue) { + int hashCode = makeHashCode(blockValue); + int entryIndex = findEntry(data, blockValue, hashCode); + if (entryIndex >= 0) { + return (table[entryIndex] & mask) - 1; + } else { + return -1; + } + } + + private int makeHashCode(int[] blockData, int blockStart) { + int blockLimit = blockStart + blockLength; + int hashCode = blockData[blockStart++]; + do { + hashCode = 37 * hashCode + blockData[blockStart++]; + } while (blockStart < blockLimit); + return hashCode; + } + + private int makeHashCode(char[] blockData, int blockStart) { + int blockLimit = blockStart + blockLength; + int hashCode = blockData[blockStart++]; + do { + hashCode = 37 * hashCode + blockData[blockStart++]; + } while (blockStart < blockLimit); + return hashCode; + } + + private int makeHashCode(int blockValue) { + int hashCode = blockValue; + for (int i = 1; i < blockLength; ++i) { + hashCode = 37 * hashCode + blockValue; + } + return hashCode; + } + + private void addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex) { + assert(0 <= dataIndex && dataIndex < mask); + int entryIndex = findEntry(data32, data16, data32, data16, blockStart, hashCode); + if (entryIndex < 0) { + table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); + } + } + + private int findEntry(int[] data32, char[] data16, + int[] blockData32, char[] blockData16, int blockStart, int hashCode) { + int shiftedHashCode = hashCode << shift; + int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1 + for (int entryIndex = initialEntryIndex;;) { + int entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int dataIndex = (entry & mask) - 1; + if (data32 != null ? + equalBlocks(data32, dataIndex, blockData32, blockStart, blockLength) : + blockData32 != null ? + equalBlocks(data16, dataIndex, blockData32, blockStart, blockLength) : + equalBlocks(data16, dataIndex, blockData16, blockStart, blockLength)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + private int findEntry(int[] data, int blockValue, int hashCode) { + int shiftedHashCode = hashCode << shift; + int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1 + for (int entryIndex = initialEntryIndex;;) { + int entry = table[entryIndex]; + if (entry == 0) { + return ~entryIndex; + } + if ((entry & ~mask) == shiftedHashCode) { + int dataIndex = (entry & mask) - 1; + if (allValuesSameAs(data, dataIndex, blockLength, blockValue)) { + return entryIndex; + } + } + entryIndex = nextIndex(initialEntryIndex, entryIndex); + } + } + + private int nextIndex(int initialEntryIndex, int entryIndex) { + // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); + return (entryIndex + initialEntryIndex) % length; + } + + /** Ensures non-negative n % m (that is 0..m-1). */ + private int modulo(int n, int m) { + int i = n % m; + if (i < 0) { + i += m; + } + return i; + } + + // Hash table. + // The length is a prime number, larger than the maximum data length. + // The "shift" lower bits store a data index + 1. + // The remaining upper bits store a partial hashCode of the block data values. + private int[] table; + private int length; + private int shift; + private int mask; + + private int blockLength; + } + private int compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks) { // ASCII data will be stored as a linear table, even if the following code // does not yet count it that way. @@ -836,7 +1010,8 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl * * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. */ - private int compactData(int fastILimit, int[] newData, int dataNullIndex) { + private int compactData( + int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks) { // The linear ASCII data has been copied into newData already. int newDataLength = 0; for (int i = 0; newDataLength < ASCII_LIMIT; @@ -844,8 +1019,11 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl index[i] = newDataLength; } - int iLimit = highStart >> CodePointTrie.SHIFT_3; int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH; + mixedBlocks.init(newData.length, blockLength); + mixedBlocks.extend(newData, 0, 0, newDataLength); + + int iLimit = highStart >> CodePointTrie.SHIFT_3; int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; int fastLength = 0; for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) { @@ -853,11 +1031,14 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH; inc = 1; fastLength = newDataLength; + mixedBlocks.init(newData.length, blockLength); + mixedBlocks.extend(newData, 0, 0, newDataLength); } if (flags[i] == ALL_SAME) { int value = index[i]; // Find an earlier part of the data array of length blockLength // that is filled with this value. + int n = mixedBlocks.findAllSameBlock(newData, value); // If we find a match, and the current block is the data null block, // and it is not a fast block but matches the start of a fast block, // then we need to continue looking. @@ -865,34 +1046,35 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl // and not all of the rest of the fast block is filled with this value. // Otherwise trie.getRange() would detect that the fast block starts at // dataNullOffset and assume incorrectly that it is filled with the null value. - int n; - for (int start = 0; - (n = findAllSameBlock(newData, start, newDataLength, - value, blockLength)) >= 0 && - i == dataNullIndex && i >= fastILimit && n < fastLength && - isStartOfSomeFastBlock(n, index, fastILimit); - start = n + 1) {} + while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && + isStartOfSomeFastBlock(n, index, fastILimit)) { + n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); + } if (n >= 0) { index[i] = n; } else { n = getAllSameOverlap(newData, newDataLength, value, blockLength); index[i] = newDataLength - n; + int prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = value; ++n; } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else if (flags[i] == MIXED) { int block = index[i]; - int n = findSameBlock(newData, 0, newDataLength, data, block, blockLength); + int n = mixedBlocks.findBlock(newData, data, block); if (n >= 0) { index[i] = n; } else { n = getOverlap(newData, newDataLength, data, block, blockLength); index[i] = newDataLength - n; + int prevDataLength = newDataLength; while (n < blockLength) { newData[newDataLength++] = data[block + n++]; } + mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); } } else /* SAME_AS */ { int j = index[i]; @@ -903,7 +1085,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl return newDataLength; } - private int compactIndex(int fastILimit) { + private int compactIndex(int fastILimit, MixedBlocks mixedBlocks) { int fastIndexLength = fastILimit >> (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3); if ((highStart >> CodePointTrie.FAST_SHIFT) <= fastIndexLength) { // Only the linear fast index, no multi-stage index tables. @@ -937,6 +1119,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl } } + mixedBlocks.init(fastIndexLength, CodePointTrie.INDEX_3_BLOCK_LENGTH); + mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); + // Examine index-3 blocks. For each determine one of: // - same as the index-3 null block // - same as a fast-index block @@ -947,6 +1132,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl // Also determine an upper limit for the index-3 table length. int index3Capacity = 0; i3FirstNull = index3NullOffset; + boolean hasLongI3Blocks = false; // If the fast index covers the whole BMP, then // the multi-stage index is only for supplementary code points. // Otherwise, the multi-stage index covers all of Unicode. @@ -971,13 +1157,13 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH; } else { index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; } i3FirstNull = 0; } } else { if (oredI3 <= 0xffff) { - int n = findSameBlock(fastIndex, 0, fastIndexLength, - index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH); + int n = mixedBlocks.findBlock(fastIndex, index, i); if (n >= 0) { flags[i] = I3_BMP; index[i] = n; @@ -988,6 +1174,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl } else { flags[i] = I3_18; index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; + hasLongI3Blocks = true; } } i = j; @@ -1003,6 +1190,13 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl int index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; index16 = Arrays.copyOf(fastIndex, index16Capacity); + mixedBlocks.init(index16Capacity, CodePointTrie.INDEX_3_BLOCK_LENGTH); + MixedBlocks longI3Blocks = null; + if (hasLongI3Blocks) { + longI3Blocks = new MixedBlocks(); + longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH); + } + // Compact the index-3 table and write an uncompacted version of the index-2 table. char[] index2 = new char[index2Capacity]; int i2Length = 0; @@ -1022,8 +1216,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl } else if (f == I3_BMP) { i3 = index[i]; } else if (f == I3_16) { - int n = findSameBlock(index16, index3Start, indexLength, - index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH); + int n = mixedBlocks.findBlock(index16, index, i); if (n >= 0) { i3 = n; } else { @@ -1035,12 +1228,18 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH); } i3 = indexLength - n; + int prevIndexLength = indexLength; while (n < CodePointTrie.INDEX_3_BLOCK_LENGTH) { index16[indexLength++] = (char)index[i + n++]; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } } } else { assert(f == I3_18); + assert(hasLongI3Blocks); // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. int j = i; int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH; @@ -1073,8 +1272,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl index16[k++] = (char)v; index16[k - 9] = (char)upperBits; } while (j < jLimit); - int n = findSameBlock(index16, index3Start, indexLength, - index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); + int n = longI3Blocks.findBlock(index16, index16, indexLength); if (n >= 0) { i3 = n | 0x8000; } else { @@ -1086,6 +1284,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); } i3 = (indexLength - n) | 0x8000; + int prevIndexLength = indexLength; if (n > 0) { int start = indexLength; while (n < INDEX_3_18BIT_BLOCK_LENGTH) { @@ -1094,6 +1293,10 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl } else { indexLength += INDEX_3_18BIT_BLOCK_LENGTH; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); + if (hasLongI3Blocks) { + longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); + } } } if (index3NullOffset < 0 && i3FirstNull >= 0) { @@ -1116,16 +1319,23 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl } // Compact the index-2 table and write the index-1 table. + // assert(CodePointTrie.INDEX_2_BLOCK_LENGTH == CodePointTrie.INDEX_3_BLOCK_LENGTH) : + // "must re-init mixedBlocks"; int blockLength = CodePointTrie.INDEX_2_BLOCK_LENGTH; int i1 = fastIndexLength; for (int i = 0; i < i2Length; i += blockLength) { - if ((i2Length - i) < blockLength) { + int n; + if ((i2Length - i) >= blockLength) { + // normal block + assert(blockLength == CodePointTrie.INDEX_2_BLOCK_LENGTH); + n = mixedBlocks.findBlock(index16, index2, i); + } else { // highStart is inside the last index-2 block. Shorten it. blockLength = i2Length - i; + n = findSameBlock(index16, index3Start, indexLength, + index2, i, blockLength); } int i2; - int n = findSameBlock(index16, index3Start, indexLength, - index2, i, blockLength); if (n >= 0) { i2 = n; } else { @@ -1136,9 +1346,11 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl n = getOverlap(index16, indexLength, index2, i, blockLength); } i2 = indexLength - n; + int prevIndexLength = indexLength; while (n < blockLength) { index16[indexLength++] = index2[i + n++]; } + mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); } // Set the index-1 table entry. index16[i1++] = (char)i2; @@ -1186,7 +1398,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl int[] newData = Arrays.copyOf(asciiData, newDataCapacity); int dataNullIndex = allSameBlocks.findMostUsed(); - int newDataLength = compactData(fastILimit, newData, dataNullIndex); + + MixedBlocks mixedBlocks = new MixedBlocks(); + int newDataLength = compactData(fastILimit, newData, dataNullIndex, mixedBlocks); assert(newDataLength <= newDataCapacity); data = newData; dataLength = newDataLength; @@ -1203,7 +1417,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl dataNullOffset = CodePointTrie.NO_DATA_NULL_OFFSET; } - int indexLength = compactIndex(fastILimit); + int indexLength = compactIndex(fastILimit, mixedBlocks); highStart = realHighStart; return indexLength; }