From 767598009dca0b88c6586eeee52452ebb8016795 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 2 Mar 2021 18:26:42 -0800 Subject: [PATCH] ICU-21368 unit-test & fix BytesTrie jump delta encoding --- icu4c/source/common/bytestriebuilder.cpp | 26 ++++++++---- icu4c/source/common/unicode/bytestrie.h | 3 ++ .../source/common/unicode/bytestriebuilder.h | 5 +++ icu4c/source/test/intltest/bytestrietest.cpp | 41 +++++++++++++++++++ .../core/src/com/ibm/icu/util/BytesTrie.java | 9 +++- .../com/ibm/icu/util/BytesTrieBuilder.java | 30 ++++++++++---- .../ibm/icu/dev/test/util/BytesTrieTest.java | 37 +++++++++++++++++ 7 files changed, 132 insertions(+), 19 deletions(-) diff --git a/icu4c/source/common/bytestriebuilder.cpp b/icu4c/source/common/bytestriebuilder.cpp index ec1ab7d8f50..28256f272a7 100644 --- a/icu4c/source/common/bytestriebuilder.cpp +++ b/icu4c/source/common/bytestriebuilder.cpp @@ -474,31 +474,39 @@ BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { U_ASSERT(i>=0); if(i<=BytesTrie::kMaxOneByteDelta) { return write(i); + } else { + char intBytes[5]; + return write(intBytes, internalEncodeDelta(i, intBytes)); } - char intBytes[5]; - int32_t length; +} + +int32_t +BytesTrieBuilder::internalEncodeDelta(int32_t i, char intBytes[]) { + U_ASSERT(i>=0); + if(i<=BytesTrie::kMaxOneByteDelta) { + intBytes[0]=(char)i; + return 1; + } + int32_t length=1; if(i<=BytesTrie::kMaxTwoByteDelta) { intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); - length=1; } else { if(i<=BytesTrie::kMaxThreeByteDelta) { intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); - length=2; } else { if(i<=0xffffff) { intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; - length=3; } else { intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; intBytes[1]=(char)(i>>24); - length=4; + length=2; } - intBytes[1]=(char)(i>>16); + intBytes[length++]=(char)(i>>16); } - intBytes[1]=(char)(i>>8); + intBytes[length++]=(char)(i>>8); } intBytes[length++]=(char)i; - return write(intBytes, length); + return length; } U_NAMESPACE_END diff --git a/icu4c/source/common/unicode/bytestrie.h b/icu4c/source/common/unicode/bytestrie.h index 85f802df420..271a81d1b4d 100644 --- a/icu4c/source/common/unicode/bytestrie.h +++ b/icu4c/source/common/unicode/bytestrie.h @@ -30,6 +30,8 @@ #include "unicode/uobject.h" #include "unicode/ustringtrie.h" +class BytesTrieTest; + U_NAMESPACE_BEGIN class ByteSink; @@ -378,6 +380,7 @@ public: private: friend class BytesTrieBuilder; + friend class ::BytesTrieTest; /** * Constructs a BytesTrie reader instance. diff --git a/icu4c/source/common/unicode/bytestriebuilder.h b/icu4c/source/common/unicode/bytestriebuilder.h index cae16e48b45..3cff89e443d 100644 --- a/icu4c/source/common/unicode/bytestriebuilder.h +++ b/icu4c/source/common/unicode/bytestriebuilder.h @@ -30,6 +30,8 @@ #include "unicode/stringpiece.h" #include "unicode/stringtriebuilder.h" +class BytesTrieTest; + U_NAMESPACE_BEGIN class BytesTrieElement; @@ -125,6 +127,8 @@ public: BytesTrieBuilder &clear(); private: + friend class ::BytesTrieTest; + BytesTrieBuilder(const BytesTrieBuilder &other); // no copy constructor BytesTrieBuilder &operator=(const BytesTrieBuilder &other); // no assignment operator @@ -168,6 +172,7 @@ private: virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal); virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node); virtual int32_t writeDeltaTo(int32_t jumpTarget); + static int32_t internalEncodeDelta(int32_t i, char intBytes[]); CharString *strings; // Pointer not object so we need not #include internal charstr.h. BytesTrieElement *elements; diff --git a/icu4c/source/test/intltest/bytestrietest.cpp b/icu4c/source/test/intltest/bytestrietest.cpp index bdf0b9003bb..3aaa5c9e4fa 100644 --- a/icu4c/source/test/intltest/bytestrietest.cpp +++ b/icu4c/source/test/intltest/bytestrietest.cpp @@ -56,6 +56,7 @@ public: void TestTruncatingIteratorFromLinearMatchLong(); void TestIteratorFromBytes(); void TestFailedIterator(); + void TestDelta(); void checkData(const StringAndValue data[], int32_t dataLength); void checkData(const StringAndValue data[], int32_t dataLength, UStringTrieBuildOption buildOption); @@ -110,6 +111,7 @@ void BytesTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name, TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchLong); TESTCASE_AUTO(TestIteratorFromBytes); TESTCASE_AUTO(TestFailedIterator); + TESTCASE_AUTO(TestDelta); TESTCASE_AUTO_END; } @@ -599,6 +601,45 @@ void BytesTrieTest::TestFailedIterator() { } } +void BytesTrieTest::TestDelta() { + char intBytes0[5]; + char intBytes1[5]; + static constexpr int32_t sampleDeltas[] = { + -1, 0, 1, 2, 3, 0xa5, 0xbe, 0xbf, + -2, 0xc0, 0xc1, 0xeee, 0x1234, 0x2ffe, 0x2fff, + -3, 0x3000, 0x3001, 0x3003, 0x50005, 0xdfffe, 0xdffff, + -4, 0xe0000, 0xe0001, 0xef0123, 0xfffffe, 0xffffff, + -5, 0x1000000, 0x1000001, 0x7fffffff + }; + int32_t expectedLength = 0; + for (int32_t delta : sampleDeltas) { + if (delta < 0) { + expectedLength = -delta; + continue; + } + // Encoding twice into differently-initialized arrays + // catches bytes that are not written to. + memset(intBytes0, 0, sizeof(intBytes0)); + memset(intBytes1, 1, sizeof(intBytes1)); + int32_t length0 = BytesTrieBuilder::internalEncodeDelta(delta, intBytes0); + int32_t length1 = BytesTrieBuilder::internalEncodeDelta(delta, intBytes1); + assertTrue(UnicodeString(u"non-zero length to encode delta ") + delta, length0 > 0); + assertEquals(UnicodeString(u"consistent length to encode delta ") + delta, length0, length1); + assertEquals(UnicodeString(u"expected length to encode delta ") + delta, + expectedLength, length0); + for (int32_t i = 0; i < length0; ++i) { + uint8_t b0 = intBytes0[i]; + uint8_t b1 = intBytes1[i]; + assertEquals(UnicodeString(u"differently encoded delta ") + delta + + u" at byte index " + i, b0, b1); + } + const uint8_t *start = (const uint8_t *)intBytes0; + const uint8_t *pos = BytesTrie::jumpByDelta(start); + assertEquals(UnicodeString(u"roundtrip for delta ") + delta, + delta, (int32_t)(pos - start) - length0); + } +} + void BytesTrieTest::checkData(const StringAndValue data[], int32_t dataLength) { logln("checkData(dataLength=%d, fast)", (int)dataLength); checkData(data, dataLength, USTRINGTRIE_BUILD_FAST); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/BytesTrie.java b/icu4j/main/classes/core/src/com/ibm/icu/util/BytesTrie.java index 8bc778eece2..5854b2a98fb 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/BytesTrie.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/BytesTrie.java @@ -794,8 +794,13 @@ public final class BytesTrie implements Cloneable, Iterable { return skipValue(pos, leadByte); } - // Reads a jump delta and jumps. - private static int jumpByDelta(byte[] bytes, int pos) { + /** + * Reads a jump delta and jumps. + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public static int jumpByDelta(byte[] bytes, int pos) { int delta=bytes[pos++]&0xff; if(delta=0); if(i<=BytesTrie.kMaxOneByteDelta) { return write(i); + } else { + return write(intBytes, internalEncodeDelta(i, intBytes)); } - int length; + } + /** + * @internal + * @deprecated This API is ICU internal only. + */ + @Deprecated + public static final int internalEncodeDelta(int i, byte[] intBytes) { + assert(i>=0); + if(i<=BytesTrie.kMaxOneByteDelta) { + intBytes[0]=(byte)i; + return 1; + } + int length=1; if(i<=BytesTrie.kMaxTwoByteDelta) { intBytes[0]=(byte)(BytesTrie.kMinTwoByteDeltaLead+(i>>8)); - length=1; } else { if(i<=BytesTrie.kMaxThreeByteDelta) { intBytes[0]=(byte)(BytesTrie.kMinThreeByteDeltaLead+(i>>16)); - length=2; } else { if(i<=0xffffff) { intBytes[0]=(byte)BytesTrie.kFourByteDeltaLead; - length=3; } else { intBytes[0]=(byte)BytesTrie.kFiveByteDeltaLead; intBytes[1]=(byte)(i>>24); - length=4; + length=2; } - intBytes[1]=(byte)(i>>16); + intBytes[length++]=(byte)(i>>16); } - intBytes[1]=(byte)(i>>8); + intBytes[length++]=(byte)(i>>8); } intBytes[length++]=(byte)i; - return write(intBytes, length); + return length; } // Byte serialization of the trie. diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/BytesTrieTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/BytesTrieTest.java index da16f83f607..abe461ed469 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/BytesTrieTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/BytesTrieTest.java @@ -13,6 +13,7 @@ package com.ibm.icu.dev.test.util; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.NoSuchElementException; import org.junit.Test; @@ -531,6 +532,42 @@ public class BytesTrieTest extends TestFmwk { assertEquals("abc value", 300, copy.getValue()); } + @Test + public void TestDelta() { + byte[] intBytes0 = new byte[5]; + byte[] intBytes1 = new byte[5]; + int[] sampleDeltas = new int[] { + -1, 0, 1, 2, 3, 0xa5, 0xbe, 0xbf, + -2, 0xc0, 0xc1, 0xeee, 0x1234, 0x2ffe, 0x2fff, + -3, 0x3000, 0x3001, 0x3003, 0x50005, 0xdfffe, 0xdffff, + -4, 0xe0000, 0xe0001, 0xef0123, 0xfffffe, 0xffffff, + -5, 0x1000000, 0x1000001, 0x7fffffff + }; + int expectedLength = 0; + for (int delta : sampleDeltas) { + if (delta < 0) { + expectedLength = -delta; + continue; + } + // Encoding twice into differently-initialized arrays + // catches bytes that are not written to. + Arrays.fill(intBytes0, (byte)0); + Arrays.fill(intBytes1, (byte)1); + int length0 = BytesTrieBuilder.internalEncodeDelta(delta, intBytes0); + int length1 = BytesTrieBuilder.internalEncodeDelta(delta, intBytes1); + assertTrue("non-zero length to encode delta " + delta, length0 > 0); + assertEquals("consistent length to encode delta " + delta, length0, length1); + assertEquals("expected length to encode delta " + delta, expectedLength, length0); + for (int i = 0; i < length0; ++i) { + byte b0 = intBytes0[i]; + byte b1 = intBytes1[i]; + assertEquals("differently encoded delta " + delta + " at byte index " + i, b0, b1); + } + int pos = BytesTrie.jumpByDelta(intBytes0, 0); + assertEquals("roundtrip for delta " + delta, delta, pos - length0); + } + } + private void checkData(StringAndValue data[]) { checkData(data, data.length); }