ICU-20735 simpler state saving for C++ string tries

This commit is contained in:
Markus Scherer 2019-08-10 10:51:28 -07:00
parent 8646872f68
commit 527ff9f7ac
6 changed files with 200 additions and 6 deletions

View file

@ -97,6 +97,39 @@ public:
return *this;
}
/**
* Returns the state of this trie as a 64-bit integer.
* The state value is never 0.
*
* @return opaque state value
* @see resetToState64
* @draft ICU 65
*/
uint64_t getState64() const {
    // Upper field: remainingMatchLength_ stored with a bias of +2,
    // shifted above the position bits.
    const uint64_t lengthField =
        static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift;
    // Lower field: distance of the current position from the start of the trie bytes.
    const uint64_t offsetField = static_cast<uint64_t>(pos_ - bytes_);
    return lengthField | offsetField;
}
/**
* Resets this trie to the saved state.
* Unlike resetToState(State), the 64-bit state value
* must be from getState64() from the same trie object or
* from one initialized the exact same way.
* Because it performs no validation, this method is faster.
*
* @param state The opaque trie state value from getState64().
* @return *this
* @see getState64
* @see resetToState
* @see reset
* @draft ICU 65
*/
BytesTrie &resetToState64(uint64_t state) {
    // Low bits: distance of the saved position from the start of the trie bytes.
    const uint64_t offset = state & kState64PosMask;
    // High bits: the remaining match length, which getState64() stores with a bias of +2.
    const int32_t biasedLength = static_cast<int32_t>(state >> kState64RemainingShift);
    pos_ = bytes_ + offset;
    remainingMatchLength_ = biasedLength - 2;
    return *this;
}
/**
* BytesTrie state object, for saving a trie's current state
* and resetting the trie back to this state later.
@ -505,6 +538,13 @@ private:
static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
// For getState64():
// The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
// so we need at least 5 bits for that.
// We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
// Layout of the 64-bit state: the biased length occupies the top 5 bits (bits 59..63),
// and the low 59 bits selected by kState64PosMask hold the offset pos_ - bytes_.
static constexpr int32_t kState64RemainingShift = 59;
static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
uint8_t *ownedArray_;
// Fixed value referencing the BytesTrie bytes.

View file

@ -97,6 +97,39 @@ public:
return *this;
}
/**
* Returns the state of this trie as a 64-bit integer.
* The state value is never 0.
*
* @return opaque state value
* @see resetToState64
* @draft ICU 65
*/
uint64_t getState64() const {
    // Upper field: remainingMatchLength_ stored with a bias of +2,
    // shifted above the position bits.
    const uint64_t lengthField =
        static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift;
    // Lower field: distance of the current position from the start of the trie units.
    const uint64_t offsetField = static_cast<uint64_t>(pos_ - uchars_);
    return lengthField | offsetField;
}
/**
* Resets this trie to the saved state.
* Unlike resetToState(State), the 64-bit state value
* must be from getState64() from the same trie object or
* from one initialized the exact same way.
* Because it performs no validation, this method is faster.
*
* @param state The opaque trie state value from getState64().
* @return *this
* @see getState64
* @see resetToState
* @see reset
* @draft ICU 65
*/
UCharsTrie &resetToState64(uint64_t state) {
    // Low bits: distance of the saved position from the start of the trie units.
    const uint64_t offset = state & kState64PosMask;
    // High bits: the remaining match length, which getState64() stores with a bias of +2.
    const int32_t biasedLength = static_cast<int32_t>(state >> kState64RemainingShift);
    pos_ = uchars_ + offset;
    remainingMatchLength_ = biasedLength - 2;
    return *this;
}
/**
* UCharsTrie state object, for saving a trie's current state
* and resetting the trie back to this state later.
@ -563,6 +596,13 @@ private:
static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
// For getState64():
// The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
// so we need at least 5 bits for that.
// We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
// Layout of the 64-bit state: the biased length occupies the top 5 bits (bits 59..63),
// and the low 59 bits selected by kState64PosMask hold the offset pos_ - uchars_.
static constexpr int32_t kState64RemainingShift = 59;
static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
char16_t *ownedArray_;
// Fixed value referencing the UCharsTrie words.

View file

@ -64,6 +64,7 @@ public:
void checkFirst(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNext(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextWithState64(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextString(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(const BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(BytesTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
@ -613,6 +614,7 @@ void BytesTrieTest::checkData(const StringAndValue data[], int32_t dataLength, U
checkFirst(*trie, data, dataLength);
checkNext(*trie, data, dataLength);
checkNextWithState(*trie, data, dataLength);
checkNextWithState64(*trie, data, dataLength);
checkNextString(*trie, data, dataLength);
checkIterator(*trie, data, dataLength);
}
@ -825,6 +827,61 @@ void BytesTrieTest::checkNextWithState(BytesTrie &trie,
}
}
// Exercises getState64()/resetToState64(): for each test string, walks the first third
// of it, saves the 64-bit state, moves the trie away from that state, and verifies that
// resetting to the saved state restores current()/getValue() and still allows matching
// the rest of the string (twice, with another reset in between).
void BytesTrieTest::checkNextWithState64(BytesTrie &trie,
                                         const StringAndValue data[], int32_t dataLength) {
    assertTrue("trie(initial state).getState64()!=0", trie.getState64() != 0);
    for(int32_t i=0; i<dataLength; ++i) {
        const char *key=data[i].s;
        const int32_t keyLength=static_cast<int32_t>(strlen(key));
        const int32_t prefixLength=keyLength/3;
        const char *suffix=key+prefixLength;
        const int32_t suffixLength=keyLength-prefixLength;
        // Feed the prefix one byte at a time.
        for(const char *p=key; p!=suffix; ++p) {
            if(!USTRINGTRIE_MATCHES(trie.next(*p))) {
                errln("trie.next()=USTRINGTRIE_NO_MATCH for a prefix of %s", data[i].s);
                return;
            }
        }
        // Remember the state, and what the trie reports while in it.
        const uint64_t saved=trie.getState64();
        assertTrue("trie.getState64()!=0", saved != 0);
        const UStringTrieResult savedResult=trie.current();
        const bool savedHasValue=USTRINGTRIE_HAS_VALUE(savedResult);
        // -99 is only a placeholder for "no value at the saved state".
        const int32_t savedValue= savedHasValue ? trie.getValue() : -99;
        // next(0) is expected to be a mismatch.
        const UStringTrieResult mismatch=trie.next(0);
        if(!(mismatch==USTRINGTRIE_NO_MATCH && mismatch==trie.current())) {
            errln("trie.next(0) matched after part of %s", data[i].s);
        }
        UStringTrieResult result;
        if( savedResult!=trie.resetToState64(saved).current() ||
                (savedHasValue && savedValue!=trie.getValue())
        ) {
            errln("trie.next(part of %s) changes current()/getValue() after "
                  "getState64/next(0)/resetToState64",
                  data[i].s);
        } else {
            // First pass over the rest of the string, continuing from the restored state.
            result=trie.next(suffix, suffixLength);
            if(!USTRINGTRIE_HAS_VALUE(result) || result!=trie.current()) {
                errln("trie.next(rest of %s) does not seem to contain %s after "
                      "getState64/next(0)/resetToState64",
                      data[i].s, data[i].s);
            } else {
                // Second pass: reset again and match the rest of the string once more.
                result=trie.resetToState64(saved).next(suffix, suffixLength);
                if(!USTRINGTRIE_HAS_VALUE(result) || result!=trie.current()) {
                    errln("trie does not seem to contain %s after getState64/next(rest)/resetToState64",
                          data[i].s);
                } else if(trie.getValue()!=data[i].value) {
                    errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
                          data[i].s,
                          static_cast<long>(trie.getValue()), static_cast<long>(trie.getValue()),
                          static_cast<long>(data[i].value), static_cast<long>(data[i].value));
                }
            }
        }
        trie.reset();
    }
}
// next(string) is also tested in other functions,
// but here we try to go partway through the string, and then beyond it.
void BytesTrieTest::checkNextString(BytesTrie &trie,

View file

@ -71,6 +71,7 @@ public:
void checkFirst(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNext(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextWithState64(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextString(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(UCharsTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
@ -762,6 +763,7 @@ void UCharsTrieTest::checkData(const StringAndValue data[], int32_t dataLength,
checkFirst(*trie, data, dataLength);
checkNext(*trie, data, dataLength);
checkNextWithState(*trie, data, dataLength);
checkNextWithState64(*trie, data, dataLength);
checkNextString(*trie, data, dataLength);
checkIterator(*trie, data, dataLength);
}
@ -987,6 +989,61 @@ void UCharsTrieTest::checkNextWithState(UCharsTrie &trie,
}
}
// Exercises getState64()/resetToState64(): for each test string, walks the first third
// of it, saves the 64-bit state, moves the trie away from that state, and verifies that
// resetting to the saved state restores current()/getValue() and still allows matching
// the rest of the string (twice, with another reset in between).
void UCharsTrieTest::checkNextWithState64(UCharsTrie &trie,
                                          const StringAndValue data[], int32_t dataLength) {
    assertTrue("trie(initial state).getState64()!=0", trie.getState64() != 0);
    for(int32_t i=0; i<dataLength; ++i) {
        UnicodeString key=UnicodeString(data[i].s, -1, US_INV).unescape();
        const int32_t keyLength=key.length();
        const int32_t prefixLength=keyLength/3;
        const int32_t suffixLength=keyLength-prefixLength;
        // Feed the prefix one code unit at a time.
        for(int32_t k=0; k<prefixLength; ++k) {
            if(!USTRINGTRIE_MATCHES(trie.next(key[k]))) {
                errln("trie.next()=USTRINGTRIE_NO_MATCH for a prefix of %s", data[i].s);
                return;
            }
        }
        // Remember the state, and what the trie reports while in it.
        const uint64_t saved=trie.getState64();
        assertTrue("trie.getState64()!=0", saved != 0);
        const UStringTrieResult savedResult=trie.current();
        const bool savedHasValue=USTRINGTRIE_HAS_VALUE(savedResult);
        // -99 is only a placeholder for "no value at the saved state".
        const int32_t savedValue= savedHasValue ? trie.getValue() : -99;
        // next(0) is expected to be a mismatch.
        const UStringTrieResult mismatch=trie.next(0);
        if(!(mismatch==USTRINGTRIE_NO_MATCH && mismatch==trie.current())) {
            errln("trie.next(0) matched after part of %s", data[i].s);
        }
        UStringTrieResult result;
        if( savedResult!=trie.resetToState64(saved).current() ||
                (savedHasValue && savedValue!=trie.getValue())
        ) {
            errln("trie.next(part of %s) changes current()/getValue() after "
                  "getState64/next(0)/resetToState64",
                  data[i].s);
        } else {
            // First pass over the rest of the string, continuing from the restored state.
            result=trie.next(key.getTerminatedBuffer()+prefixLength, suffixLength);
            if(!USTRINGTRIE_HAS_VALUE(result) || result!=trie.current()) {
                errln("trie.next(rest of %s) does not seem to contain %s after "
                      "getState64/next(0)/resetToState64",
                      data[i].s, data[i].s);
            } else {
                // Second pass: reset again and match the rest of the string once more.
                result=trie.resetToState64(saved).
                        next(key.getTerminatedBuffer()+prefixLength, suffixLength);
                if(!USTRINGTRIE_HAS_VALUE(result) || result!=trie.current()) {
                    errln("trie does not seem to contain %s after getState64/next(rest)/resetToState64",
                          data[i].s);
                } else if(trie.getValue()!=data[i].value) {
                    errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
                          data[i].s,
                          static_cast<long>(trie.getValue()), static_cast<long>(trie.getValue()),
                          static_cast<long>(data[i].value), static_cast<long>(data[i].value));
                }
            }
        }
        trie.reset();
    }
}
// next(string) is also tested in other functions,
// but here we try to go partway through the string, and then beyond it.
void UCharsTrieTest::checkNextString(UCharsTrie &trie,

View file

@ -768,16 +768,16 @@ public class BytesTrieTest extends TestFmwk {
(resultAtState.hasValue() && valueAtState!=trie.getValue())
) {
errln("trie.next(part of "+data[i].s+") changes current()/getValue() after "+
"saveState/next(0)/resetToState");
"getState64/next(0)/resetToState64");
} else if(!(result=trie.next(expectedString, partialLength, stringLength)).hasValue() ||
result!=trie.current()) {
errln("trie.next(rest of "+data[i].s+") does not seem to contain "+data[i].s+" after "+
"saveState/next(0)/resetToState");
"getState64/next(0)/resetToState64");
} else if(!(result=trie.resetToState64(state).
next(expectedString, partialLength, stringLength)).hasValue() ||
result!=trie.current()) {
errln("trie does not seem to contain "+data[i].s+
" after saveState/next(rest)/resetToState");
" after getState64/next(rest)/resetToState64");
} else if(trie.getValue()!=data[i].value) {
errln(String.format("trie value for %s is %d=0x%x instead of expected %d=0x%x",
data[i].s,

View file

@ -914,16 +914,16 @@ public class CharsTrieTest extends TestFmwk {
(resultAtState.hasValue() && valueAtState!=trie.getValue())
) {
errln("trie.next(part of "+data[i].s+") changes current()/getValue() after "+
"saveState/next(0)/resetToState");
"getState64/next(0)/resetToState64");
} else if(!(result=trie.next(expectedString, partialLength, stringLength)).hasValue() ||
result!=trie.current()) {
errln("trie.next(rest of "+data[i].s+") does not seem to contain "+data[i].s+" after "+
"saveState/next(0)/resetToState");
"getState64/next(0)/resetToState64");
} else if(!(result=trie.resetToState64(state).
next(expectedString, partialLength, stringLength)).hasValue() ||
result!=trie.current()) {
errln("trie does not seem to contain "+data[i].s+
" after saveState/next(rest)/resetToState");
" after getState64/next(rest)/resetToState64");
} else if(trie.getValue()!=data[i].value) {
errln(String.format("trie value for %s is %d=0x%x instead of expected %d=0x%x",
data[i].s,