mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-13530 fix bugs, add tests, clarify docs (#83)
* ICU-13530 test & fix cptrie.getRange() when small null data block matches the start of a non-null fast data block * ICU-13530 fix test bug * ICU-13530 test & fix bug calculating worst-case data array capacity at start of compaction * ICU-13530 docs: clarify buildImmutable() discards upper bits, trie then returns unsigned ints; range iteration slower than if ranges were stored directly * ICU-13530 accept feedback on docs
This commit is contained in:
parent
e9e50b5aa3
commit
000c266045
9 changed files with 331 additions and 28 deletions
|
@ -274,6 +274,11 @@ U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name);
|
|||
* The header.dataNullOffset (split across two header fields, high bits in header.options)
|
||||
* is the offset of a widely shared data block filled with one single value.
|
||||
* It helps quickly skip over large ranges of data with that value.
|
||||
* The builder must ensure that if the start of any data block (fast or small)
|
||||
* matches the dataNullOffset, then the whole block must be filled with the null value.
|
||||
* Special care must be taken if there is no fast null data block
|
||||
* but a small one, which is shorter, and it matches the *start* of some fast data block.
|
||||
*
|
||||
* Similarly, the header.index3NullOffset is the index-array offset of an index-3 block
|
||||
* where all index entries point to the dataNullOffset.
|
||||
* If there is no such data or index-3 block, then these offsets are set to
|
||||
|
|
|
@ -91,7 +91,7 @@ private:
|
|||
void maskValues(uint32_t mask);
|
||||
UChar32 findHighStart() const;
|
||||
int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
|
||||
int32_t compactData(int32_t fastILimit, uint32_t *newData);
|
||||
int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex);
|
||||
int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode);
|
||||
int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
|
||||
|
||||
|
@ -599,12 +599,12 @@ int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
|
|||
return -1;
|
||||
}
|
||||
|
||||
int32_t findAllSameBlock(const uint32_t *p, int32_t length, uint32_t value,
|
||||
int32_t blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
length -= blockLength;
|
||||
int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
|
||||
uint32_t value, int32_t blockLength) {
|
||||
// Ensure that we do not even partially get past limit.
|
||||
limit -= blockLength;
|
||||
|
||||
for (int32_t block = 0; block <= length; ++block) {
|
||||
for (int32_t block = start; block <= limit; ++block) {
|
||||
if (p[block] == value) {
|
||||
for (int32_t i = 1;; ++i) {
|
||||
if (i == blockLength) {
|
||||
|
@ -665,6 +665,15 @@ int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
|
|||
return length - i;
|
||||
}
|
||||
|
||||
bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
|
||||
for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
|
||||
if (index[i] == dataOffset) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the start of the last range in the trie by enumerating backward.
|
||||
* Indexes for code points higher than this will be omitted.
|
||||
|
@ -775,6 +784,9 @@ int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSame
|
|||
// ASCII data will be stored as a linear table, even if the following code
|
||||
// does not yet count it that way.
|
||||
int32_t newDataCapacity = ASCII_LIMIT;
|
||||
// Add room for a small data null block in case it would match the start of
|
||||
// a fast data block where dataNullOffset must not be set in that case.
|
||||
newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
|
||||
// Add room for special values (errorValue, highValue) and padding.
|
||||
newDataCapacity += 4;
|
||||
int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
|
||||
|
@ -816,6 +828,7 @@ int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSame
|
|||
if (getDataBlock(i) < 0) {
|
||||
return -1;
|
||||
}
|
||||
newDataCapacity += blockLength;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -918,7 +931,8 @@ void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
|
|||
*
|
||||
* It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
|
||||
*/
|
||||
int32_t MutableCodePointTrie::compactData(int32_t fastILimit, uint32_t *newData) {
|
||||
int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
|
||||
uint32_t *newData, int32_t dataNullIndex) {
|
||||
#ifdef UCPTRIE_DEBUG
|
||||
int32_t countSame=0, sumOverlaps=0;
|
||||
bool printData = dataLength == 29088 /* line.brk */ ||
|
||||
|
@ -941,14 +955,31 @@ int32_t MutableCodePointTrie::compactData(int32_t fastILimit, uint32_t *newData)
|
|||
int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
|
||||
int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
|
||||
int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
|
||||
int32_t fastLength = 0;
|
||||
for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
|
||||
if (i == fastILimit) {
|
||||
blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
|
||||
inc = 1;
|
||||
fastLength = newDataLength;
|
||||
}
|
||||
if (flags[i] == ALL_SAME) {
|
||||
uint32_t value = index[i];
|
||||
int32_t n = findAllSameBlock(newData, newDataLength, value, blockLength);
|
||||
int32_t n;
|
||||
// Find an earlier part of the data array of length blockLength
|
||||
// that is filled with this value.
|
||||
// If we find a match, and the current block is the data null block,
|
||||
// and it is not a fast block but matches the start of a fast block,
|
||||
// then we need to continue looking.
|
||||
// This is because this small block is shorter than the fast block,
|
||||
// and not all of the rest of the fast block is filled with this value.
|
||||
// Otherwise trie.getRange() would detect that the fast block starts at
|
||||
// dataNullOffset and assume incorrectly that it is filled with the null value.
|
||||
for (int32_t start = 0;
|
||||
(n = findAllSameBlock(newData, start, newDataLength,
|
||||
value, blockLength)) >= 0 &&
|
||||
i == dataNullIndex && i >= fastILimit && n < fastLength &&
|
||||
isStartOfSomeFastBlock(n, index, fastILimit);
|
||||
start = n + 1) {}
|
||||
if (n >= 0) {
|
||||
DEBUG_DO(++countSame);
|
||||
index[i] = n;
|
||||
|
@ -1306,7 +1337,8 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC
|
|||
}
|
||||
uprv_memcpy(newData, asciiData, sizeof(asciiData));
|
||||
|
||||
int32_t newDataLength = compactData(fastILimit, newData);
|
||||
int32_t dataNullIndex = allSameBlocks.findMostUsed();
|
||||
int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex);
|
||||
U_ASSERT(newDataLength <= newDataCapacity);
|
||||
uprv_free(data);
|
||||
data = newData;
|
||||
|
@ -1318,7 +1350,6 @@ int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorC
|
|||
return 0;
|
||||
}
|
||||
|
||||
int32_t dataNullIndex = allSameBlocks.findMostUsed();
|
||||
if (dataNullIndex >= 0) {
|
||||
dataNullOffset = index[dataNullIndex];
|
||||
#ifdef UCPTRIE_DEBUG
|
||||
|
|
|
@ -89,17 +89,19 @@ enum UCPTrieValueWidth {
|
|||
*/
|
||||
UCPTRIE_VALUE_BITS_ANY = -1,
|
||||
/**
|
||||
* 16 bits per UCPTrie data value.
|
||||
* The trie stores 16 bits per data value.
|
||||
* It returns them as unsigned values 0..0xffff=65535.
|
||||
* @draft ICU 63
|
||||
*/
|
||||
UCPTRIE_VALUE_BITS_16,
|
||||
/**
|
||||
* 32 bits per UCPTrie data value.
|
||||
* The trie stores 32 bits per data value.
|
||||
* @draft ICU 63
|
||||
*/
|
||||
UCPTRIE_VALUE_BITS_32,
|
||||
/**
|
||||
* 8 bits per UCPTrie data value.
|
||||
* The trie stores 8 bits per data value.
|
||||
* It returns them as unsigned values 0..0xff=255.
|
||||
* @draft ICU 63
|
||||
*/
|
||||
UCPTRIE_VALUE_BITS_8
|
||||
|
@ -272,6 +274,8 @@ UCPTrieValueFilter(const void *context, uint32_t value);
|
|||
/**
|
||||
* Returns the last code point such that all those from start to there have the same value.
|
||||
* Can be used to efficiently iterate over all same-value ranges in a trie.
|
||||
* (This is normally faster than iterating over code points and get()ting each value,
|
||||
* but much slower than a data structure that stores ranges directly.)
|
||||
*
|
||||
* If the UCPTrieValueFilter function pointer is not NULL, then
|
||||
* the value to be delivered is passed through that function, and the return value is the end
|
||||
|
|
|
@ -129,6 +129,9 @@ umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c);
|
|||
/**
|
||||
* Returns the last code point such that all those from start to there have the same value.
|
||||
* Can be used to efficiently iterate over all same-value ranges in a trie.
|
||||
* (This is normally faster than iterating over code points and get()ting each value,
|
||||
* but much slower than a data structure that stores ranges directly.)
|
||||
*
|
||||
* The trie can be modified between calls to this function.
|
||||
*
|
||||
* If the UCPTrieValueFilter function pointer is not NULL, then
|
||||
|
@ -189,6 +192,15 @@ umutablecptrie_setRange(UMutableCPTrie *trie,
|
|||
* Compacts the data and builds an immutable UCPTrie according to the parameters.
|
||||
* After this, the mutable trie will be empty.
|
||||
*
|
||||
* The mutable trie stores 32-bit values until buildImmutable() is called.
|
||||
* If values shorter than 32 bits are to be stored in the immutable trie,
|
||||
* then the upper bits are discarded.
|
||||
* For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581,
|
||||
* and the value width is 8 bits, then each of these is stored as 0x81
|
||||
* and the immutable trie will return that as an unsigned value.
|
||||
* (Some implementations may want to make productive temporary use of the upper bits
|
||||
* until buildImmutable() discards them.)
|
||||
*
|
||||
* Not every possible set of mappings can be built into a UCPTrie,
|
||||
* because of limitations resulting from speed and space optimizations.
|
||||
* Every Unicode assigned character can be mapped to a unique value.
|
||||
|
|
|
@ -1490,6 +1490,75 @@ TrieTestGetRangesFixedSurr(void) {
|
|||
umutablecptrie_close(mutableTrie);
|
||||
}
|
||||
|
||||
static void TestSmallNullBlockMatchesFast(void) {
|
||||
// The initial builder+getRange code had a bug:
|
||||
// When there is no null data block in the fast-index range,
|
||||
// but a fast-range data block starts with enough values to match a small data block,
|
||||
// then getRange() got confused.
|
||||
// The builder must prevent this.
|
||||
static const SetRange setRanges[] = {
|
||||
{ 0, 0x880, 1 },
|
||||
// U+0880..U+088F map to initial value 0, potential match for small null data block.
|
||||
{ 0x890, 0x1040, 2 },
|
||||
// U+1040..U+1050 map to 0.
|
||||
// First small null data block in a small-type trie.
|
||||
// In a fast-type trie, it is ok to match a small null data block at U+1041
|
||||
// but not at U+1040.
|
||||
{ 0x1051, 0x10000, 3 },
|
||||
// No fast data block (block length 64) filled with 0 regardless of trie type.
|
||||
// Need more blocks filled with 0 than the largest range above,
|
||||
// and need a highStart above that so that it actually counts.
|
||||
{ 0x20000, 0x110000, 9 }
|
||||
};
|
||||
|
||||
static const CheckRange checkRanges[] = {
|
||||
{ 0x0880, 1 },
|
||||
{ 0x0890, 0 },
|
||||
{ 0x1040, 2 },
|
||||
{ 0x1051, 0 },
|
||||
{ 0x10000, 3 },
|
||||
{ 0x20000, 0 },
|
||||
{ 0x110000, 9 }
|
||||
};
|
||||
|
||||
testTrieRanges("small0-in-fast", FALSE,
|
||||
setRanges, UPRV_LENGTHOF(setRanges),
|
||||
checkRanges, UPRV_LENGTHOF(checkRanges));
|
||||
}
|
||||
|
||||
static void ShortAllSameBlocksTest(void) {
|
||||
static const char *const testName = "short-all-same";
|
||||
// Many all-same-value blocks but only of the small block length used in the mutable trie.
|
||||
// The builder code needs to turn a group of short ALL_SAME blocks below fastLimit
|
||||
// into a MIXED block, and reserve data array capacity for that.
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
UMutableCPTrie *mutableTrie = umutablecptrie_open(0, 0xad, &errorCode);
|
||||
CheckRange checkRanges[0x101];
|
||||
int32_t i;
|
||||
if (U_FAILURE(errorCode)) {
|
||||
log_err("error: umutablecptrie_open(%s) failed: %s\n", testName, u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
for (i = 0; i < 0x1000; i += 0x10) {
|
||||
uint32_t value = i >> 4;
|
||||
umutablecptrie_setRange(mutableTrie, i, i + 0xf, value, &errorCode);
|
||||
checkRanges[value].limit = i + 0x10;
|
||||
checkRanges[value].value = value;
|
||||
}
|
||||
checkRanges[0x100].limit = 0x110000;
|
||||
checkRanges[0x100].value = 0;
|
||||
if (U_FAILURE(errorCode)) {
|
||||
log_err("error: setting values into a mutable trie (%s) failed - %s\n",
|
||||
testName, u_errorName(errorCode));
|
||||
umutablecptrie_close(mutableTrie);
|
||||
return;
|
||||
}
|
||||
|
||||
mutableTrie = testTrieSerializeAllValueWidth(testName, mutableTrie, FALSE,
|
||||
checkRanges, UPRV_LENGTHOF(checkRanges));
|
||||
umutablecptrie_close(mutableTrie);
|
||||
}
|
||||
|
||||
void
|
||||
addUCPTrieTest(TestNode** root) {
|
||||
addTest(root, &TrieTestSet1, "tsutil/ucptrietest/TrieTestSet1");
|
||||
|
@ -1503,4 +1572,6 @@ addUCPTrieTest(TestNode** root) {
|
|||
addTest(root, &ManyAllSameBlocksTest, "tsutil/ucptrietest/ManyAllSameBlocksTest");
|
||||
addTest(root, &MuchDataTest, "tsutil/ucptrietest/MuchDataTest");
|
||||
addTest(root, &TrieTestGetRangesFixedSurr, "tsutil/ucptrietest/TrieTestGetRangesFixedSurr");
|
||||
addTest(root, &TestSmallNullBlockMatchesFast, "tsutil/ucptrietest/TestSmallNullBlockMatchesFast");
|
||||
addTest(root, &ShortAllSameBlocksTest, "tsutil/ucptrietest/ShortAllSameBlocksTest");
|
||||
}
|
||||
|
|
|
@ -331,10 +331,14 @@ public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
|
|||
|
||||
/**
|
||||
* Sets the range object to a range of code points beginning with the start parameter.
|
||||
* The range end is the the last code point such that
|
||||
* The range start is the same as the start input parameter
|
||||
* (even if there are preceding code points that have the same value).
|
||||
* The range end is the last code point such that
|
||||
* all those from start to there have the same value.
|
||||
* Returns false if start is not 0..U+10FFFF.
|
||||
* Can be used to efficiently iterate over all same-value ranges in a map.
|
||||
* (This is normally faster than iterating over code points and get()ting each value,
|
||||
* but may be much slower than a data structure that stores ranges directly.)
|
||||
*
|
||||
* <p>If the {@link ValueFilter} parameter is not null, then
|
||||
* the value to be delivered is passed through that filter, and the return value is the end
|
||||
|
@ -365,7 +369,9 @@ public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
|
|||
|
||||
/**
|
||||
* Sets the range object to a range of code points beginning with the start parameter.
|
||||
* The range end is the the last code point such that
|
||||
* The range start is the same as the start input parameter
|
||||
* (even if there are preceding code points that have the same value).
|
||||
* The range end is the last code point such that
|
||||
* all those from start to there have the same value.
|
||||
* Returns false if start is not 0..U+10FFFF.
|
||||
*
|
||||
|
|
|
@ -70,21 +70,23 @@ public abstract class CodePointTrie extends CodePointMap {
|
|||
*/
|
||||
public enum ValueWidth {
|
||||
/**
|
||||
* 16 bits per CodePointTrie data value.
|
||||
* The trie stores 16 bits per data value.
|
||||
* It returns them as unsigned values 0..0xffff=65535.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
BITS_16,
|
||||
/**
|
||||
* 32 bits per CodePointTrie data value.
|
||||
* The trie stores 32 bits per data value.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
BITS_32,
|
||||
/**
|
||||
* 8 bits per CodePointTrie data value.
|
||||
* The trie stores 8 bits per data value.
|
||||
* It returns them as unsigned values 0..0xff=255.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
|
|
|
@ -157,6 +157,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
*
|
||||
* <p>The trie can be modified between calls to this function.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
|
@ -309,6 +312,15 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
* Compacts the data and builds an immutable {@link CodePointTrie} according to the parameters.
|
||||
* After this, the mutable trie will be empty.
|
||||
*
|
||||
* <p>The mutable trie stores 32-bit values until buildImmutable() is called.
|
||||
* If values shorter than 32 bits are to be stored in the immutable trie,
|
||||
* then the upper bits are discarded.
|
||||
* For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581,
|
||||
* and the value width is 8 bits, then each of these is stored as 0x81
|
||||
* and the immutable trie will return that as an unsigned value.
|
||||
* (Some implementations may want to make productive temporary use of the upper bits
|
||||
* until buildImmutable() discards them.)
|
||||
*
|
||||
* <p>Not every possible set of mappings can be built into a CodePointTrie,
|
||||
* because of limitations resulting from speed and space optimizations.
|
||||
* Every Unicode assigned character can be mapped to a unique value.
|
||||
|
@ -556,11 +568,12 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
return -1;
|
||||
}
|
||||
|
||||
private static int findAllSameBlock(int[] p, int length, int value, int blockLength) {
|
||||
// Ensure that we do not even partially get past length.
|
||||
length -= blockLength;
|
||||
private static int findAllSameBlock(int[] p, int start, int limit,
|
||||
int value, int blockLength) {
|
||||
// Ensure that we do not even partially get past limit.
|
||||
limit -= blockLength;
|
||||
|
||||
for (int block = 0; block <= length; ++block) {
|
||||
for (int block = start; block <= limit; ++block) {
|
||||
if (p[block] == value) {
|
||||
for (int i = 1;; ++i) {
|
||||
if (i == blockLength) {
|
||||
|
@ -614,6 +627,15 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
return length - i;
|
||||
}
|
||||
|
||||
private static boolean isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit) {
|
||||
for (int i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
|
||||
if (index[i] == dataOffset) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the start of the last range in the trie by enumerating backward.
|
||||
* Indexes for code points higher than this will be omitted.
|
||||
|
@ -720,6 +742,9 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
// ASCII data will be stored as a linear table, even if the following code
|
||||
// does not yet count it that way.
|
||||
int newDataCapacity = ASCII_LIMIT;
|
||||
// Add room for a small data null block in case it would match the start of
|
||||
// a fast data block where dataNullOffset must not be set in that case.
|
||||
newDataCapacity += CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
|
||||
// Add room for special values (errorValue, highValue) and padding.
|
||||
newDataCapacity += 4;
|
||||
int iLimit = highStart >> CodePointTrie.SHIFT_3;
|
||||
|
@ -761,6 +786,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
if (getDataBlock(i) < 0) {
|
||||
return -1;
|
||||
}
|
||||
newDataCapacity += blockLength;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -810,7 +836,7 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
*
|
||||
* It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
|
||||
*/
|
||||
private int compactData(int fastILimit, int[] newData) {
|
||||
private int compactData(int fastILimit, int[] newData, int dataNullIndex) {
|
||||
// The linear ASCII data has been copied into newData already.
|
||||
int newDataLength = 0;
|
||||
for (int i = 0; newDataLength < ASCII_LIMIT;
|
||||
|
@ -821,14 +847,31 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
int iLimit = highStart >> CodePointTrie.SHIFT_3;
|
||||
int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH;
|
||||
int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
|
||||
int fastLength = 0;
|
||||
for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) {
|
||||
if (i == fastILimit) {
|
||||
blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH;
|
||||
inc = 1;
|
||||
fastLength = newDataLength;
|
||||
}
|
||||
if (flags[i] == ALL_SAME) {
|
||||
int value = index[i];
|
||||
int n = findAllSameBlock(newData, newDataLength, value, blockLength);
|
||||
// Find an earlier part of the data array of length blockLength
|
||||
// that is filled with this value.
|
||||
// If we find a match, and the current block is the data null block,
|
||||
// and it is not a fast block but matches the start of a fast block,
|
||||
// then we need to continue looking.
|
||||
// This is because this small block is shorter than the fast block,
|
||||
// and not all of the rest of the fast block is filled with this value.
|
||||
// Otherwise trie.getRange() would detect that the fast block starts at
|
||||
// dataNullOffset and assume incorrectly that it is filled with the null value.
|
||||
int n;
|
||||
for (int start = 0;
|
||||
(n = findAllSameBlock(newData, start, newDataLength,
|
||||
value, blockLength)) >= 0 &&
|
||||
i == dataNullIndex && i >= fastILimit && n < fastLength &&
|
||||
isStartOfSomeFastBlock(n, index, fastILimit);
|
||||
start = n + 1) {}
|
||||
if (n >= 0) {
|
||||
index[i] = n;
|
||||
} else {
|
||||
|
@ -1068,7 +1111,6 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
if (indexLength >= (CodePointTrie.NO_INDEX3_NULL_OFFSET + CodePointTrie.INDEX_3_BLOCK_LENGTH)) {
|
||||
// The index-3 offsets exceed 15 bits, or
|
||||
// the last one cannot be distinguished from the no-null-block value.
|
||||
// TODO: review exceptions / error codes
|
||||
throw new IndexOutOfBoundsException(
|
||||
"The trie data exceeds limitations of the data structure.");
|
||||
}
|
||||
|
@ -1143,18 +1185,17 @@ public final class MutableCodePointTrie extends CodePointMap implements Cloneabl
|
|||
int newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
|
||||
int[] newData = Arrays.copyOf(asciiData, newDataCapacity);
|
||||
|
||||
int newDataLength = compactData(fastILimit, newData);
|
||||
int dataNullIndex = allSameBlocks.findMostUsed();
|
||||
int newDataLength = compactData(fastILimit, newData, dataNullIndex);
|
||||
assert(newDataLength <= newDataCapacity);
|
||||
data = newData;
|
||||
dataLength = newDataLength;
|
||||
if (dataLength > (0x3ffff + CodePointTrie.SMALL_DATA_BLOCK_LENGTH)) {
|
||||
// The offset of the last data block is too high to be stored in the index table.
|
||||
// TODO: review exceptions / error codes
|
||||
throw new IndexOutOfBoundsException(
|
||||
"The trie data exceeds limitations of the data structure.");
|
||||
}
|
||||
|
||||
int dataNullIndex = allSameBlocks.findMostUsed();
|
||||
if (dataNullIndex >= 0) {
|
||||
dataNullOffset = index[dataNullIndex];
|
||||
initialValue = data[dataNullOffset];
|
||||
|
|
|
@ -10,6 +10,7 @@ package com.ibm.icu.dev.test.util;
|
|||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.junit.Test;
|
||||
|
@ -18,9 +19,14 @@ import org.junit.runners.JUnit4;
|
|||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.CodePointMap;
|
||||
import com.ibm.icu.util.CodePointTrie;
|
||||
import com.ibm.icu.util.MutableCodePointTrie;
|
||||
import com.ibm.icu.util.VersionInfo;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public final class CodePointTrieTest extends TestFmwk {
|
||||
|
@ -32,6 +38,11 @@ public final class CodePointTrieTest extends TestFmwk {
|
|||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return Utility.hex(start) + ".." + Utility.hex(limit - 1) + ':' + Utility.hex(value);
|
||||
}
|
||||
|
||||
final int start, limit;
|
||||
final int value;
|
||||
}
|
||||
|
@ -63,6 +74,11 @@ public final class CodePointTrieTest extends TestFmwk {
|
|||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "≤" + Utility.hex(limit - 1) + ':' + Utility.hex(value);
|
||||
}
|
||||
|
||||
final int limit;
|
||||
final int value;
|
||||
}
|
||||
|
@ -982,4 +998,119 @@ public final class CodePointTrieTest extends TestFmwk {
|
|||
testGetRangesFixedSurr("fixedSurr4", mutableTrie,
|
||||
CodePointMap.RangeOption.FIXED_ALL_SURROGATES, checkRangesFixedSurr4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestSmallNullBlockMatchesFast() {
|
||||
// The initial builder+getRange code had a bug:
|
||||
// When there is no null data block in the fast-index range,
|
||||
// but a fast-range data block starts with enough values to match a small data block,
|
||||
// then getRange() got confused.
|
||||
// The builder must prevent this.
|
||||
final SetRange setRanges[] = {
|
||||
new SetRange(0, 0x880, 1),
|
||||
// U+0880..U+088F map to initial value 0, potential match for small null data block.
|
||||
new SetRange(0x890, 0x1040, 2),
|
||||
// U+1040..U+1050 map to 0.
|
||||
// First small null data block in a small-type trie.
|
||||
// In a fast-type trie, it is ok to match a small null data block at U+1041
|
||||
// but not at U+1040.
|
||||
new SetRange(0x1051, 0x10000, 3),
|
||||
// No fast data block (block length 64) filled with 0 regardless of trie type.
|
||||
// Need more blocks filled with 0 than the largest range above,
|
||||
// and need a highStart above that so that it actually counts.
|
||||
new SetRange(0x20000, 0x110000, 9)
|
||||
};
|
||||
|
||||
final CheckRange checkRanges[] = {
|
||||
new CheckRange(0x0880, 1),
|
||||
new CheckRange(0x0890, 0),
|
||||
new CheckRange(0x1040, 2),
|
||||
new CheckRange(0x1051, 0),
|
||||
new CheckRange(0x10000, 3),
|
||||
new CheckRange(0x20000, 0),
|
||||
new CheckRange(0x110000, 9)
|
||||
};
|
||||
|
||||
testTrieRanges("small0-in-fast", false, setRanges, checkRanges);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void ShortAllSameBlocksTest() {
|
||||
// Many all-same-value blocks but only of the small block length used in the mutable trie.
|
||||
// The builder code needs to turn a group of short ALL_SAME blocks below fastLimit
|
||||
// into a MIXED block, and reserve data array capacity for that.
|
||||
MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0xad);
|
||||
CheckRange[] checkRanges = new CheckRange[0x101];
|
||||
for (int i = 0; i < 0x1000; i += 0x10) {
|
||||
int value = i >> 4;
|
||||
mutableTrie.setRange(i, i + 0xf, value);
|
||||
checkRanges[value] = new CheckRange(i + 0x10, value);
|
||||
}
|
||||
checkRanges[0x100] = new CheckRange(0x110000, 0);
|
||||
|
||||
mutableTrie = testTrieSerializeAllValueWidth(
|
||||
"short-all-same", mutableTrie, false, checkRanges);
|
||||
}
|
||||
|
||||
private void testIntProperty(String testName, String baseSetPattern, int property) {
|
||||
UnicodeSet uni11 = new UnicodeSet(baseSetPattern);
|
||||
MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0xad);
|
||||
ArrayList<CheckRange> checkRanges = new ArrayList<>();
|
||||
int start = 0;
|
||||
int limit = 0;
|
||||
int prevValue = 0;
|
||||
for (UnicodeSet.EntryRange range : uni11.ranges()) {
|
||||
// Ranges are in ascending order, each range is non-empty,
|
||||
// and there is a gap from one to the next.
|
||||
// Each code point in a range could have a different value.
|
||||
// Any of them can be 0.
|
||||
// We keep initial value 0 between ranges.
|
||||
if (prevValue != 0) {
|
||||
mutableTrie.setRange(start, limit - 1, prevValue);
|
||||
checkRanges.add(new CheckRange(limit, prevValue));
|
||||
start = limit;
|
||||
prevValue = 0;
|
||||
}
|
||||
int c = limit = range.codepoint;
|
||||
do {
|
||||
int value;
|
||||
if (property == UProperty.AGE) {
|
||||
VersionInfo version = UCharacter.getAge(c);
|
||||
value = (version.getMajor() << 4) | version.getMinor();
|
||||
} else {
|
||||
value = UCharacter.getIntPropertyValue(c, property);
|
||||
}
|
||||
if (value != prevValue) {
|
||||
if (start < limit) {
|
||||
if (prevValue != 0) {
|
||||
mutableTrie.setRange(start, limit - 1, prevValue);
|
||||
}
|
||||
checkRanges.add(new CheckRange(limit, prevValue));
|
||||
}
|
||||
start = c;
|
||||
prevValue = value;
|
||||
}
|
||||
limit = ++c;
|
||||
} while (c <= range.codepointEnd);
|
||||
}
|
||||
if (prevValue != 0) {
|
||||
mutableTrie.setRange(start, limit - 1, prevValue);
|
||||
checkRanges.add(new CheckRange(limit, prevValue));
|
||||
}
|
||||
if (limit < 0x110000) {
|
||||
checkRanges.add(new CheckRange(0x110000, 0));
|
||||
}
|
||||
testTrieSerializeAllValueWidth(testName, mutableTrie, false,
|
||||
checkRanges.toArray(new CheckRange[checkRanges.size()]));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void AgePropertyTest() {
|
||||
testIntProperty("age", "[:age=11:]", UProperty.AGE);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void BlockPropertyTest() {
|
||||
testIntProperty("block", "[:^blk=No_Block:]", UProperty.BLOCK);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue