ICU-8167 port trie API changes from Java

X-SVN-Rev: 29367
This commit is contained in:
Markus Scherer 2011-01-27 21:41:03 +00:00
parent ff2821ca77
commit 9cc27feeb4
13 changed files with 766 additions and 382 deletions

View file

@ -15,11 +15,16 @@
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/uobject.h"
#include "cmemory.h"
#include "uassert.h"
#include "bytestrie.h"
U_NAMESPACE_BEGIN
// Releases the serialized trie array if this object adopted it from the builder.
// ownedArray_ is NULL for objects created by the public constructor or the
// copy constructor, which only alias the bytes; there is nothing to free then.
// NOTE(review): relies on uprv_free() accepting NULL -- confirm in cmemory.h.
BytesTrie::~BytesTrie() {
uprv_free(ownedArray_);
}
// lead byte already shifted right by 1.
int32_t
BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
@ -178,6 +183,9 @@ BytesTrie::next(int32_t inByte) {
if(pos==NULL) {
return USTRINGTRIE_NO_MATCH;
}
if(inByte<0) {
inByte+=0x100;
}
int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
if(length>=0) {
// Remaining part of a linear-match node.

View file

@ -37,17 +37,47 @@ class UVector32;
* Light-weight, non-const reader class for a BytesTrie.
* Traverses a byte-serialized data structure with minimal state,
* for mapping byte sequences to non-negative integer values.
*
* This class owns the serialized trie data only if it was constructed by
* the builder's build() method.
* The public constructor and the copy constructor only alias the data (only copy the pointer).
* There is no assignment operator.
*
* This class is not intended for public subclassing.
*/
class U_COMMON_API BytesTrie : public UMemory {
public:
/**
* Constructs a BytesTrie reader instance.
* @param trieBytes The trie bytes.
*
* The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
* starting with the first byte of that sequence.
* The BytesTrie object will not read more bytes than
* the BytesTrieBuilder generated in the corresponding build() call.
*
* The array is not copied/cloned and must not be modified while
* the BytesTrie object is in use.
*
* @param trieBytes The byte array that contains the serialized trie.
*/
BytesTrie(const void *trieBytes)
: bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
: ownedArray_(NULL), bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), remainingMatchLength_(-1) {}
/**
* Destructor.
*/
~BytesTrie();
/**
* Copy constructor, copies the other trie reader object and its state,
* but not the byte array which will be shared. (Shallow copy.)
* The copy never owns the byte array: its ownedArray_ is always NULL,
* even when the other object owns the array.
* NOTE(review): presumably the copy must not be used after an owning
* original has been destroyed -- confirm the intended lifetime rule.
* @param other Another BytesTrie object.
*/
BytesTrie(const BytesTrie &other)
: ownedArray_(NULL), bytes_(other.bytes_),
pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
/**
* Resets this trie to its initial state.
*/
@ -108,15 +138,22 @@ public:
/**
* Restarts the traversal at the initial state and consumes one input byte.
* Gives the same result as reset().next(inByte).
* @param inByte Input byte value. Negative values -0x100..-1 are mapped to 0..0xff.
*               Values below -0x100 and above 0xff will never match.
* @return The match/value Result.
*/
inline UStringTrieResult first(int32_t inByte) {
    // No linear-match node is in progress when starting from the root.
    remainingMatchLength_=-1;
    // Fold a negative (signed char) input into the unsigned byte range.
    return nextImpl(bytes_, inByte<0 ? inByte+0x100 : inByte);
}
/**
* Traverses the trie from the current state for this input byte.
* @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
* Values below -0x100 and above 0xff will never match.
* @return The match/value Result.
*/
UStringTrieResult next(int32_t inByte);
@ -262,6 +299,20 @@ public:
private:
friend class BytesTrieBuilder;
/**
* Constructs a BytesTrie reader instance.
* Unlike the public constructor which just aliases an array,
* this constructor adopts the builder's array.
* This constructor is only called by the builder.
* @param adoptBytes Array whose ownership passes to this object; it is stored
*                   in ownedArray_ and released by the destructor.
* @param trieBytes Start of the serialized trie; traversal begins here.
*                  NOTE(review): presumably points into adoptBytes -- confirm in the builder.
*/
BytesTrie(void *adoptBytes, const void *trieBytes)
: ownedArray_(reinterpret_cast<uint8_t *>(adoptBytes)),
bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), remainingMatchLength_(-1) {}
// No assignment operator.
BytesTrie &operator=(const BytesTrie &other);
// Puts the trie into the stopped state by clearing the current position.
// NOTE(review): next() appears to return USTRINGTRIE_NO_MATCH for a NULL
// position -- confirm against the full implementation.
inline void stop() {
pos_=NULL;
}
@ -407,6 +458,8 @@ private:
static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
uint8_t *ownedArray_;
// Fixed value referencing the BytesTrie bytes.
const uint8_t *bytes_;

View file

@ -31,6 +31,9 @@
enum UStringTrieResult {
/**
* The input unit(s) did not continue a matching string.
* Once current()/next() return USTRINGTRIE_NO_MATCH,
* all further calls to current()/next() will also return USTRINGTRIE_NO_MATCH,
* until the trie is reset to its original state or to a saved state.
*/
USTRINGTRIE_NO_MATCH,
/**

View file

@ -15,6 +15,7 @@
#include <string.h>
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/stringpiece.h"
#include "bytestrie.h"
#include "bytestriebuilder.h"
@ -29,7 +30,7 @@ struct StringAndValue {
class BytesTrieTest : public IntlTest {
public:
BytesTrieTest() {}
BytesTrieTest();
virtual ~BytesTrieTest();
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
@ -44,7 +45,7 @@ public:
void TestValuesForState();
void TestCompact();
StringPiece buildMonthsTrie(BytesTrieBuilder &builder, UStringTrieBuildOption buildOption);
BytesTrie *buildMonthsTrie(UStringTrieBuildOption buildOption);
void TestHasUniqueValue();
void TestGetNextBytes();
void TestIteratorFromBranch();
@ -52,24 +53,34 @@ public:
void TestTruncatingIteratorFromRoot();
void TestTruncatingIteratorFromLinearMatchShort();
void TestTruncatingIteratorFromLinearMatchLong();
void TestIteratorFromBytes();
void checkData(const StringAndValue data[], int32_t dataLength);
void checkData(const StringAndValue data[], int32_t dataLength, UStringTrieBuildOption buildOption);
StringPiece buildTrie(const StringAndValue data[], int32_t dataLength,
BytesTrieBuilder &builder, UStringTrieBuildOption buildOption);
void checkFirst(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNext(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNextString(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkIterator(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
BytesTrie *buildTrie(const StringAndValue data[], int32_t dataLength,
UStringTrieBuildOption buildOption);
void checkFirst(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNext(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextString(BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(const BytesTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(BytesTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
private:
BytesTrieBuilder *builder_;
};
// Factory for the test framework: returns a new heap-allocated BytesTrieTest.
// NOTE(review): presumably the caller takes ownership and deletes it -- confirm.
extern IntlTest *createBytesTrieTest() {
return new BytesTrieTest();
}
// Creates the single BytesTrieBuilder that the test methods share via builder_.
// builder_ starts out NULL and is then set to a heap-allocated builder.
// NOTE(review): neither the allocation result nor errorCode is checked here;
// presumably IcuTestErrorCode reports a failure when it goes out of scope -- confirm.
BytesTrieTest::BytesTrieTest() : builder_(NULL) {
IcuTestErrorCode errorCode(*this, "BytesTrieTest()");
builder_=new BytesTrieBuilder(errorCode);
}
// Deletes the shared builder that the constructor allocated.
BytesTrieTest::~BytesTrieTest() {
delete builder_;
}
void BytesTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
@ -94,20 +105,22 @@ void BytesTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name,
TESTCASE_AUTO(TestTruncatingIteratorFromRoot);
TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchShort);
TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchLong);
TESTCASE_AUTO(TestIteratorFromBytes);
TESTCASE_AUTO_END;
}
// Checks the builder's error reporting:
// building without any strings must set U_INDEX_OUTOFBOUNDS_ERROR, and
// adding the same string twice must set U_ILLEGAL_ARGUMENT_ERROR.
void BytesTrieTest::TestBuilder() {
IcuTestErrorCode errorCode(*this, "TestBuilder()");
BytesTrieBuilder builder;
builder.build(USTRINGTRIE_BUILD_FAST, errorCode);
// The shared builder may still hold strings from an earlier test.
builder_->clear();
// delete: build() returns a heap object whenever it does not fail.
delete builder_->build(USTRINGTRIE_BUILD_FAST, errorCode);
if(errorCode.reset()!=U_INDEX_OUTOFBOUNDS_ERROR) {
errln("BytesTrieBuilder().build() did not set U_INDEX_OUTOFBOUNDS_ERROR");
return;
}
builder.add("=", 0, errorCode).add("=", 1, errorCode).build(USTRINGTRIE_BUILD_FAST, errorCode);
// TODO: remove .build(...) once add() checks for duplicates.
// NOTE(review): the pointer returned by this build() is discarded without delete;
// presumably it is NULL on the expected error path -- confirm.
builder_->add("=", 0, errorCode).add("=", 1, errorCode).build(USTRINGTRIE_BUILD_FAST, errorCode);
if(errorCode.reset()!=U_ILLEGAL_ARGUMENT_ERROR) {
errln("BytesTrieBuilder.build() did not detect duplicates");
errln("BytesTrieBuilder.add() did not detect duplicates");
return;
}
}
@ -250,7 +263,7 @@ void BytesTrieTest::TestCompact() {
checkData(data, LENGTHOF(data));
}
StringPiece BytesTrieTest::buildMonthsTrie(BytesTrieBuilder &builder, UStringTrieBuildOption buildOption) {
BytesTrie *BytesTrieTest::buildMonthsTrie(UStringTrieBuildOption buildOption) {
// All types of nodes leading to the same value,
// for code coverage of recursive functions.
// In particular, we need a lot of branches on some single level
@ -287,111 +300,105 @@ StringPiece BytesTrieTest::buildMonthsTrie(BytesTrieBuilder &builder, UStringTri
{ "jun.", 6 },
{ "june", 6 }
};
return buildTrie(data, LENGTHOF(data), builder, buildOption);
return buildTrie(data, LENGTHOF(data), buildOption);
}
// Exercises hasUniqueValue() on the months trie (built with USTRINGTRIE_BUILD_FAST)
// in four situations: at the root, directly after next(), after getValue(),
// and from within a linear-match node.
void BytesTrieTest::TestHasUniqueValue() {
BytesTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, USTRINGTRIE_BUILD_FAST);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
BytesTrie trie(sp.data());
int32_t uniqueValue;
// At the root several different values are reachable, so no unique value is expected.
if(trie.hasUniqueValue(uniqueValue)) {
if(trie->hasUniqueValue(uniqueValue)) {
errln("unique value at root");
}
trie.next('j');
trie.next('a');
trie.next('n');
trie->next('j');
trie->next('a');
trie->next('n');
// hasUniqueValue() directly after next()
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=1) {
if(!trie->hasUniqueValue(uniqueValue) || uniqueValue!=1) {
errln("not unique value 1 after \"jan\"");
}
trie.first('j');
trie.next('u');
if(trie.hasUniqueValue(uniqueValue)) {
trie->first('j');
trie->next('u');
if(trie->hasUniqueValue(uniqueValue)) {
errln("unique value after \"ju\"");
}
if(trie.next('n')!=USTRINGTRIE_INTERMEDIATE_VALUE || 6!=trie.getValue()) {
if(trie->next('n')!=USTRINGTRIE_INTERMEDIATE_VALUE || 6!=trie->getValue()) {
errln("not normal value 6 after \"jun\"");
}
// hasUniqueValue() after getValue()
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=6) {
if(!trie->hasUniqueValue(uniqueValue) || uniqueValue!=6) {
errln("not unique value 6 after \"jun\"");
}
// hasUniqueValue() from within a linear-match node
trie.first('a');
trie.next('u');
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=8) {
if(!trie->hasUniqueValue(uniqueValue) || uniqueValue!=8) {
errln("not unique value 8 after \"au\"");
}
}
// Exercises getNextBytes() on the months trie (built with USTRINGTRIE_BUILD_SMALL):
// at the root, directly after next(), after getValue(), from inside a
// linear-match node, and after a final match.
void BytesTrieTest::TestGetNextBytes() {
BytesTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, USTRINGTRIE_BUILD_SMALL);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_SMALL));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
BytesTrie trie(sp.data());
char buffer[40];
CheckedArrayByteSink sink(buffer, LENGTHOF(buffer));
int32_t count=trie.getNextBytes(sink);
int32_t count=trie->getNextBytes(sink);
if(count!=2 || sink.NumberOfBytesAppended()!=2 || buffer[0]!='a' || buffer[1]!='j') {
errln("months getNextBytes()!=[aj] at root");
}
trie.next('j');
trie.next('a');
trie.next('n');
trie->next('j');
trie->next('a');
trie->next('n');
// getNextBytes() directly after next()
count=trie.getNextBytes(sink.Reset());
count=trie->getNextBytes(sink.Reset());
// NUL-terminate so that strcmp() below can compare the collected bytes.
// NOTE(review): buffer[count] is written before count is validated;
// this assumes count stays below the buffer size of 40 -- confirm.
buffer[count]=0;
if(count!=20 || sink.NumberOfBytesAppended()!=20 || 0!=strcmp(buffer, ".abcdefghijklmnopqru")) {
errln("months getNextBytes()!=[.abcdefghijklmnopqru] after \"jan\"");
}
// getNextBytes() after getValue()
trie.getValue(); // next() had returned USTRINGTRIE_INTERMEDIATE_VALUE.
trie->getValue(); // next() had returned USTRINGTRIE_INTERMEDIATE_VALUE.
// Zero the buffer so the bytes written next are NUL-terminated for strcmp().
memset(buffer, 0, sizeof(buffer));
count=trie.getNextBytes(sink.Reset());
count=trie->getNextBytes(sink.Reset());
if(count!=20 || sink.NumberOfBytesAppended()!=20 || 0!=strcmp(buffer, ".abcdefghijklmnopqru")) {
errln("months getNextBytes()!=[.abcdefghijklmnopqru] after \"jan\"+getValue()");
}
// getNextBytes() from a linear-match node
trie.next('u');
trie->next('u');
memset(buffer, 0, sizeof(buffer));
count=trie.getNextBytes(sink.Reset());
count=trie->getNextBytes(sink.Reset());
if(count!=1 || sink.NumberOfBytesAppended()!=1 || buffer[0]!='a') {
errln("months getNextBytes()!=[a] after \"janu\"");
}
trie.next('a');
trie->next('a');
memset(buffer, 0, sizeof(buffer));
count=trie.getNextBytes(sink.Reset());
count=trie->getNextBytes(sink.Reset());
if(count!=1 || sink.NumberOfBytesAppended()!=1 || buffer[0]!='r') {
errln("months getNextBytes()!=[r] after \"janua\"");
}
trie.next('r');
trie.next('y');
trie->next('r');
trie->next('y');
// getNextBytes() after a final match
count=trie.getNextBytes(sink.Reset());
count=trie->getNextBytes(sink.Reset());
if(count!=0 || sink.NumberOfBytesAppended()!=0) {
errln("months getNextBytes()!=[] after \"january\"");
}
}
void BytesTrieTest::TestIteratorFromBranch() {
BytesTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, USTRINGTRIE_BUILD_FAST);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
BytesTrie trie(sp.data());
// Go to a branch node.
trie.next('j');
trie.next('a');
trie.next('n');
trie->next('j');
trie->next('a');
trie->next('n');
IcuTestErrorCode errorCode(*this, "TestIteratorFromBranch()");
BytesTrie::Iterator iter(trie, 0, errorCode);
BytesTrie::Iterator iter(*trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
@ -431,20 +438,18 @@ void BytesTrieTest::TestIteratorFromBranch() {
}
void BytesTrieTest::TestIteratorFromLinearMatch() {
BytesTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, USTRINGTRIE_BUILD_SMALL);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_SMALL));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
BytesTrie trie(sp.data());
// Go into a linear-match node.
trie.next('j');
trie.next('a');
trie.next('n');
trie.next('u');
trie.next('a');
trie->next('j');
trie->next('a');
trie->next('n');
trie->next('u');
trie->next('a');
IcuTestErrorCode errorCode(*this, "TestIteratorFromLinearMatch()");
BytesTrie::Iterator iter(trie, 0, errorCode);
BytesTrie::Iterator iter(*trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
@ -461,13 +466,12 @@ void BytesTrieTest::TestIteratorFromLinearMatch() {
}
void BytesTrieTest::TestTruncatingIteratorFromRoot() {
BytesTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, USTRINGTRIE_BUILD_FAST);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromRoot()");
BytesTrie::Iterator iter(sp.data(), 4, errorCode);
BytesTrie::Iterator iter(*trie, 4, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
@ -513,18 +517,16 @@ void BytesTrieTest::TestTruncatingIteratorFromLinearMatchShort() {
{ "abcdepq", 200 },
{ "abcdeyz", 3000 }
};
BytesTrieBuilder builder;
StringPiece sp=buildTrie(data, LENGTHOF(data), builder, USTRINGTRIE_BUILD_FAST);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildTrie(data, LENGTHOF(data), USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
BytesTrie trie(sp.data());
// Go into a linear-match node.
trie.next('a');
trie.next('b');
trie->next('a');
trie->next('b');
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchShort()");
// Truncate within the linear-match node.
BytesTrie::Iterator iter(trie, 2, errorCode);
BytesTrie::Iterator iter(*trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
@ -543,19 +545,17 @@ void BytesTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
{ "abcdepq", 200 },
{ "abcdeyz", 3000 }
};
BytesTrieBuilder builder;
StringPiece sp=buildTrie(data, LENGTHOF(data), builder, USTRINGTRIE_BUILD_FAST);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildTrie(data, LENGTHOF(data), USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
BytesTrie trie(sp.data());
// Go into a linear-match node.
trie.next('a');
trie.next('b');
trie.next('c');
trie->next('a');
trie->next('b');
trie->next('c');
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchLong()");
// Truncate after the linear-match node.
BytesTrie::Iterator iter(trie, 3, errorCode);
BytesTrie::Iterator iter(*trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
@ -570,6 +570,22 @@ void BytesTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
checkIterator(iter.reset(), expected, LENGTHOF(expected));
}
// Checks that a BytesTrie::Iterator can be constructed directly from the
// serialized bytes returned by buildStringPiece(), without a BytesTrie object,
// and that it enumerates exactly the strings and values in data[].
void BytesTrieTest::TestIteratorFromBytes() {
    static const StringAndValue data[]={
        { "mm", 3 },
        { "mmm", 33 },
        { "mmnop", 333 }
    };
    // The builder is shared between tests; discard whatever an earlier test added.
    builder_->clear();
    IcuTestErrorCode errorCode(*this, "TestIteratorFromBytes()");
    for(int32_t i=0; i<LENGTHOF(data); ++i) {
        builder_->add(data[i].s, data[i].value, errorCode);
    }
    StringPiece trieBytes=builder_->buildStringPiece(USTRINGTRIE_BUILD_FAST, errorCode);
    // Stop before touching trieBytes if add() or buildStringPiece() failed:
    // there is no valid serialized trie to iterate over in that case.
    if(errorCode.logIfFailureAndReset("add()/buildStringPiece()")) {
        return;
    }
    BytesTrie::Iterator iter(trieBytes.data(), 0, errorCode);
    // Same guard as the other iterator tests: do not use a failed iterator.
    if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trieBytes) constructor")) {
        return;
    }
    checkIterator(iter, data, LENGTHOF(data));
}
void BytesTrieTest::checkData(const StringAndValue data[], int32_t dataLength) {
logln("checkData(dataLength=%d, fast)", (int)dataLength);
checkData(data, dataLength, USTRINGTRIE_BUILD_FAST);
@ -578,20 +594,19 @@ void BytesTrieTest::checkData(const StringAndValue data[], int32_t dataLength) {
}
// Builds a trie from data[] with the given build option and runs every
// check*() helper on it. Returns early when buildTrie() reports an error.
void BytesTrieTest::checkData(const StringAndValue data[], int32_t dataLength, UStringTrieBuildOption buildOption) {
BytesTrieBuilder builder;
StringPiece sp=buildTrie(data, dataLength, builder, buildOption);
if(sp.empty()) {
LocalPointer<BytesTrie> trie(buildTrie(data, dataLength, buildOption));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
checkFirst(sp, data, dataLength);
checkNext(sp, data, dataLength);
checkNextWithState(sp, data, dataLength);
checkNextString(sp, data, dataLength);
checkIterator(sp, data, dataLength);
// The same trie object is handed to each helper in turn.
checkFirst(*trie, data, dataLength);
checkNext(*trie, data, dataLength);
checkNextWithState(*trie, data, dataLength);
checkNextString(*trie, data, dataLength);
checkIterator(*trie, data, dataLength);
}
StringPiece BytesTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
BytesTrieBuilder &builder, UStringTrieBuildOption buildOption) {
BytesTrie *BytesTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
UStringTrieBuildOption buildOption) {
IcuTestErrorCode errorCode(*this, "buildTrie()");
// Add the items to the trie builder in an interesting (not trivial, not random) order.
int32_t index, step;
@ -607,47 +622,61 @@ StringPiece BytesTrieTest::buildTrie(const StringAndValue data[], int32_t dataLe
index=dataLength-1;
step=-1;
}
builder.clear();
builder_->clear();
for(int32_t i=0; i<dataLength; ++i) {
builder.add(data[index].s, data[index].value, errorCode);
builder_->add(data[index].s, data[index].value, errorCode);
index=(index+step)%dataLength;
}
StringPiece sp(builder.build(buildOption, errorCode));
StringPiece sp=builder_->buildStringPiece(buildOption, errorCode);
LocalPointer<BytesTrie> trie(builder_->build(buildOption, errorCode));
if(!errorCode.logIfFailureAndReset("add()/build()")) {
builder.add("zzz", 999, errorCode);
builder_->add("zzz", 999, errorCode);
if(errorCode.reset()!=U_NO_WRITE_PERMISSION) {
errln("builder.build().add(zzz) did not set U_NO_WRITE_PERMISSION");
}
}
logln("serialized trie size: %ld bytes\n", (long)sp.length());
return sp;
StringPiece sp2=builder_->buildStringPiece(buildOption, errorCode);
if(sp.data()==sp2.data()) {
errln("builder.buildStringPiece() before & after build() returned same array");
}
if(errorCode.isFailure()) {
return NULL;
}
// Tries from either build() method should be identical but
// BytesTrie does not implement equals().
// We just return either one.
if((dataLength&1)!=0) {
return trie.orphan();
} else {
return new BytesTrie(sp2.data());
}
}
// Verifies for the first byte c of every non-empty test string that first(c)
// behaves like reset().next(c): same result, same current(), same value,
// and the same result for the following next(). Resets the trie when done.
void BytesTrieTest::checkFirst(const StringPiece &trieBytes,
void BytesTrieTest::checkFirst(BytesTrie &trie,
const StringAndValue data[], int32_t dataLength) {
BytesTrie trie(trieBytes.data());
for(int32_t i=0; i<dataLength; ++i) {
int c=(uint8_t)*data[i].s;
int c=*data[i].s;
if(c==0) {
continue; // skip empty string
}
UStringTrieResult firstResult=trie.first(c);
int32_t firstValue=USTRINGTRIE_HAS_VALUE(firstResult) ? trie.getValue() : -1;
// s[1] is always readable here: the string is non-empty, so s[1] is at worst its NUL terminator.
UStringTrieResult nextResult=trie.next((uint8_t)data[i].s[1]);
UStringTrieResult nextResult=trie.next(data[i].s[1]);
if(firstResult!=trie.reset().next(c) ||
firstResult!=trie.current() ||
firstValue!=(USTRINGTRIE_HAS_VALUE(firstResult) ? trie.getValue() : -1) ||
nextResult!=trie.next((uint8_t)data[i].s[1])
nextResult!=trie.next(data[i].s[1])
) {
errln("trie.first(%c)!=trie.reset().next(same) for %s",
c, data[i].s);
}
}
trie.reset();
}
void BytesTrieTest::checkNext(const StringPiece &trieBytes,
void BytesTrieTest::checkNext(BytesTrie &trie,
const StringAndValue data[], int32_t dataLength) {
BytesTrie trie(trieBytes.data());
BytesTrie::State state;
for(int32_t i=0; i<dataLength; ++i) {
int32_t stringLength= (i&1) ? -1 : strlen(data[i].s);
@ -715,9 +744,8 @@ void BytesTrieTest::checkNext(const StringPiece &trieBytes,
}
}
void BytesTrieTest::checkNextWithState(const StringPiece &trieBytes,
void BytesTrieTest::checkNextWithState(BytesTrie &trie,
const StringAndValue data[], int32_t dataLength) {
BytesTrie trie(trieBytes.data());
BytesTrie::State noState, state;
for(int32_t i=0; i<dataLength; ++i) {
if((i&1)==0) {
@ -776,9 +804,8 @@ void BytesTrieTest::checkNextWithState(const StringPiece &trieBytes,
// next(string) is also tested in other functions,
// but here we try to go partway through the string, and then beyond it.
void BytesTrieTest::checkNextString(const StringPiece &trieBytes,
void BytesTrieTest::checkNextString(BytesTrie &trie,
const StringAndValue data[], int32_t dataLength) {
BytesTrie trie(trieBytes.data());
for(int32_t i=0; i<dataLength; ++i) {
const char *expectedString=data[i].s;
int32_t stringLength=strlen(expectedString);
@ -794,11 +821,11 @@ void BytesTrieTest::checkNextString(const StringPiece &trieBytes,
}
}
void BytesTrieTest::checkIterator(const StringPiece &trieBytes,
void BytesTrieTest::checkIterator(const BytesTrie &trie,
const StringAndValue data[], int32_t dataLength) {
IcuTestErrorCode errorCode(*this, "checkIterator()");
BytesTrie::Iterator iter(trieBytes.data(), 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trieBytes) constructor")) {
BytesTrie::Iterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
checkIterator(iter, data, dataLength);

View file

@ -15,6 +15,7 @@
#include <string.h>
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/uniset.h"
#include "ucharstrie.h"
#include "ucharstriebuilder.h"
@ -29,7 +30,7 @@ struct StringAndValue {
class UCharsTrieTest : public IntlTest {
public:
UCharsTrieTest() {}
UCharsTrieTest();
virtual ~UCharsTrieTest();
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
@ -46,11 +47,10 @@ public:
void TestFirstForCodePoint();
void TestNextForCodePoint();
UBool buildLargeTrie(UCharsTrieBuilder &builder, UnicodeString &result, int32_t numUniqueFirst);
UCharsTrie *buildLargeTrie(int32_t numUniqueFirst);
void TestLargeTrie();
UBool buildMonthsTrie(UCharsTrieBuilder &builder, UStringTrieBuildOption buildOption,
UnicodeString &result);
UCharsTrie *buildMonthsTrie(UStringTrieBuildOption buildOption);
void TestHasUniqueValue();
void TestGetNextUChars();
void TestIteratorFromBranch();
@ -58,24 +58,34 @@ public:
void TestTruncatingIteratorFromRoot();
void TestTruncatingIteratorFromLinearMatchShort();
void TestTruncatingIteratorFromLinearMatchLong();
void TestIteratorFromUChars();
void checkData(const StringAndValue data[], int32_t dataLength);
void checkData(const StringAndValue data[], int32_t dataLength, UStringTrieBuildOption buildOption);
UBool buildTrie(const StringAndValue data[], int32_t dataLength,
UCharsTrieBuilder &builder, UStringTrieBuildOption buildOption, UnicodeString &result);
void checkFirst(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkNext(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkNextString(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkIterator(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
UCharsTrie *buildTrie(const StringAndValue data[], int32_t dataLength,
UStringTrieBuildOption buildOption);
void checkFirst(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNext(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkNextString(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(UCharsTrie &trie, const StringAndValue data[], int32_t dataLength);
void checkIterator(UCharsTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
private:
UCharsTrieBuilder *builder_;
};
// Factory for the test framework: returns a new heap-allocated UCharsTrieTest.
// NOTE(review): presumably the caller takes ownership and deletes it -- confirm.
extern IntlTest *createUCharsTrieTest() {
return new UCharsTrieTest();
}
// Creates the single UCharsTrieBuilder that the test methods share via builder_.
// builder_ starts out NULL and is then set to a heap-allocated builder.
// NOTE(review): neither the allocation result nor errorCode is checked here;
// presumably IcuTestErrorCode reports a failure when it goes out of scope -- confirm.
UCharsTrieTest::UCharsTrieTest() : builder_(NULL) {
IcuTestErrorCode errorCode(*this, "UCharsTrieTest()");
builder_=new UCharsTrieBuilder(errorCode);
}
// Deletes the shared builder that the constructor allocated.
UCharsTrieTest::~UCharsTrieTest() {
delete builder_;
}
void UCharsTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
@ -103,21 +113,21 @@ void UCharsTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name
TESTCASE_AUTO(TestTruncatingIteratorFromRoot);
TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchShort);
TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchLong);
TESTCASE_AUTO(TestIteratorFromUChars);
TESTCASE_AUTO_END;
}
// Checks the builder's error reporting:
// building without any strings must set U_INDEX_OUTOFBOUNDS_ERROR, and
// adding the same string twice must set U_ILLEGAL_ARGUMENT_ERROR.
void UCharsTrieTest::TestBuilder() {
IcuTestErrorCode errorCode(*this, "TestBuilder()");
UCharsTrieBuilder builder;
UnicodeString trieUChars;
builder.build(USTRINGTRIE_BUILD_FAST, trieUChars, errorCode);
// NOTE(review): the shared builder_ is not clear()ed first, so the "empty builder"
// check depends on no earlier test having added strings -- confirm the test order.
delete builder_->build(USTRINGTRIE_BUILD_FAST, errorCode);
if(errorCode.reset()!=U_INDEX_OUTOFBOUNDS_ERROR) {
errln("UCharsTrieBuilder().build() did not set U_INDEX_OUTOFBOUNDS_ERROR");
return;
}
builder.add("=", 0, errorCode).add("=", 1, errorCode).build(USTRINGTRIE_BUILD_FAST, trieUChars, errorCode);
// TODO: remove .build(...) once add() checks for duplicates.
// NOTE(review): the pointer returned by this build() is discarded without delete;
// presumably it is NULL on the expected error path -- confirm.
builder_->add("=", 0, errorCode).add("=", 1, errorCode).build(USTRINGTRIE_BUILD_FAST, errorCode);
if(errorCode.reset()!=U_ILLEGAL_ARGUMENT_ERROR) {
errln("UCharsTrieBuilder.build() did not detect duplicates");
errln("UCharsTrieBuilder.add() did not detect duplicates");
return;
}
}
@ -281,41 +291,39 @@ void UCharsTrieTest::TestNextForCodePoint() {
{ "\\u4dff\\U00010000\\u9999\\U00020002", 44444 },
{ "\\u4dff\\U000103ff", 99999 }
};
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildTrie(data, LENGTHOF(data), builder, USTRINGTRIE_BUILD_FAST, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildTrie(data, LENGTHOF(data), USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
UStringTrieResult result;
if( (result=trie.nextForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x10000))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x9999))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x20000))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0xdfff))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x10ffff))!=USTRINGTRIE_FINAL_VALUE || result!=trie.current() ||
trie.getValue()!=2000000000
if( (result=trie->nextForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x10000))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x9999))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x20000))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0xdfff))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x10ffff))!=USTRINGTRIE_FINAL_VALUE || result!=trie->current() ||
trie->getValue()!=2000000000
) {
errln("UCharsTrie.nextForCodePoint() fails for %s", data[0].s);
}
if( (result=trie.firstForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x10000))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x9999))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x20002))!=USTRINGTRIE_FINAL_VALUE || result!=trie.current() ||
trie.getValue()!=44444
if( (result=trie->firstForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x10000))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x9999))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x20002))!=USTRINGTRIE_FINAL_VALUE || result!=trie->current() ||
trie->getValue()!=44444
) {
errln("UCharsTrie.nextForCodePoint() fails for %s", data[1].s);
}
if( (result=trie.reset().nextForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x10000))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x9999))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x20222))!=USTRINGTRIE_NO_MATCH || result!=trie.current() // no match for trail surrogate
if( (result=trie->reset().nextForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x10000))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x9999))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x20222))!=USTRINGTRIE_NO_MATCH || result!=trie->current() // no match for trail surrogate
) {
errln("UCharsTrie.nextForCodePoint() fails for \\u4dff\\U00010000\\u9999\\U00020222");
}
if( (result=trie.reset().nextForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie.current() ||
(result=trie.nextForCodePoint(0x103ff))!=USTRINGTRIE_FINAL_VALUE || result!=trie.current() ||
trie.getValue()!=99999
if( (result=trie->reset().nextForCodePoint(0x4dff))!=USTRINGTRIE_NO_VALUE || result!=trie->current() ||
(result=trie->nextForCodePoint(0x103ff))!=USTRINGTRIE_FINAL_VALUE || result!=trie->current() ||
trie->getValue()!=99999
) {
errln("UCharsTrie.nextForCodePoint() fails for %s", data[2].s);
}
@ -355,43 +363,41 @@ private:
} // end namespace
// Fills the builder with generated strings until numUniqueFirst distinct
// first characters have been produced, then serializes the trie.
UBool UCharsTrieTest::buildLargeTrie(UCharsTrieBuilder &builder, UnicodeString &result,
int32_t numUniqueFirst) {
UCharsTrie *UCharsTrieTest::buildLargeTrie(int32_t numUniqueFirst) {
IcuTestErrorCode errorCode(*this, "buildLargeTrie()");
Generator gen;
builder.clear();
builder_->clear();
while(gen.countUniqueFirstChars()<numUniqueFirst) {
builder.add(gen.getString(), gen.getValue(), errorCode);
builder_->add(gen.getString(), gen.getValue(), errorCode);
gen.next();
}
infoln("buildLargeTrie(%ld) added %ld strings", (long)numUniqueFirst, (long)gen.getIndex());
builder.build(USTRINGTRIE_BUILD_FAST, result, errorCode);
logln("serialized trie size: %ld UChars\n", (long)result.length());
return errorCode.isSuccess();
UnicodeString trieUChars;
builder_->buildUnicodeString(USTRINGTRIE_BUILD_FAST, trieUChars, errorCode);
logln("serialized trie size: %ld UChars\n", (long)trieUChars.length());
// NOTE(review): errorCode is not checked before returning, so a build failure does not
// yield NULL even though TestLargeTrie() tests the result with isNull() -- confirm intent.
// NOTE(review): the new trie aliases trieUChars.getBuffer() while trieUChars is a local;
// presumably buildUnicodeString() makes the string alias builder-owned storage that
// outlives this function -- confirm, otherwise the returned trie dangles.
return new UCharsTrie(trieUChars.getBuffer());
}
// Exercise a large branch node.
void UCharsTrieTest::TestLargeTrie() {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildLargeTrie(builder, trieUChars, 1111)) {
LocalPointer<UCharsTrie> trie(buildLargeTrie(1111));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
Generator gen;
while(gen.countUniqueFirstChars()<1111) {
UnicodeString x(gen.getString());
int32_t value=gen.getValue();
if(!x.isEmpty()) {
if(trie.first(x[0])==USTRINGTRIE_NO_MATCH) {
if(trie->first(x[0])==USTRINGTRIE_NO_MATCH) {
errln("first(first char U+%04X)=USTRINGTRIE_NO_MATCH for string %ld\n",
x[0], (long)gen.getIndex());
break;
}
x.remove(0, 1);
}
UStringTrieResult result=trie.next(x.getBuffer(), x.length());
if(!USTRINGTRIE_HAS_VALUE(result) || result!=trie.current() || value!=trie.getValue()) {
UStringTrieResult result=trie->next(x.getBuffer(), x.length());
if(!USTRINGTRIE_HAS_VALUE(result) || result!=trie->current() || value!=trie->getValue()) {
errln("next(%d chars U+%04X U+%04X)!=hasValue or "
"next()!=current() or getValue() wrong "
"for string %ld\n", (int)x.length(), x[0], x[1], (long)gen.getIndex());
@ -412,8 +418,7 @@ enum {
u_y=0x79
};
UBool UCharsTrieTest::buildMonthsTrie(UCharsTrieBuilder &builder, UStringTrieBuildOption buildOption,
UnicodeString &result) {
UCharsTrie *UCharsTrieTest::buildMonthsTrie(UStringTrieBuildOption buildOption) {
// All types of nodes leading to the same value,
// for code coverage of recursive functions.
// In particular, we need a lot of branches on some single level
@ -450,43 +455,41 @@ UBool UCharsTrieTest::buildMonthsTrie(UCharsTrieBuilder &builder, UStringTrieBui
{ "jun.", 6 },
{ "june", 6 }
};
return buildTrie(data, LENGTHOF(data), builder, buildOption, result);
return buildTrie(data, LENGTHOF(data), buildOption);
}
// Exercises hasUniqueValue() on the months trie (USTRINGTRIE_BUILD_FAST)
// from several trie states: at the root, directly after next(),
// after getValue(), and from within a linear-match node.
void UCharsTrieTest::TestHasUniqueValue() {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildMonthsTrie(builder, USTRINGTRIE_BUILD_FAST, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
int32_t uniqueValue;
// At the root no unique value is expected.
if(trie.hasUniqueValue(uniqueValue)) {
if(trie->hasUniqueValue(uniqueValue)) {
errln("unique value at root");
}
trie.next(u_j);
trie.next(u_a);
trie.next(u_n);
trie->next(u_j);
trie->next(u_a);
trie->next(u_n);
// hasUniqueValue() directly after next()
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=1) {
if(!trie->hasUniqueValue(uniqueValue) || uniqueValue!=1) {
errln("not unique value 1 after \"jan\"");
}
// Restart from the root with first(); after "ju" no unique value is expected.
trie.first(u_j);
trie.next(u_u);
if(trie.hasUniqueValue(uniqueValue)) {
trie->first(u_j);
trie->next(u_u);
if(trie->hasUniqueValue(uniqueValue)) {
errln("unique value after \"ju\"");
}
if(trie.next(u_n)!=USTRINGTRIE_INTERMEDIATE_VALUE || 6!=trie.getValue()) {
if(trie->next(u_n)!=USTRINGTRIE_INTERMEDIATE_VALUE || 6!=trie->getValue()) {
errln("not normal value 6 after \"jun\"");
}
// hasUniqueValue() after getValue()
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=6) {
if(!trie->hasUniqueValue(uniqueValue) || uniqueValue!=6) {
errln("not unique value 6 after \"jun\"");
}
// hasUniqueValue() from within a linear-match node
trie.first(u_a);
trie.next(u_u);
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=8) {
trie->first(u_a);
trie->next(u_u);
if(!trie->hasUniqueValue(uniqueValue) || uniqueValue!=8) {
errln("not unique value 8 after \"au\"");
}
}
@ -501,65 +504,61 @@ private:
};
// Exercises getNextUChars() on the months trie (USTRINGTRIE_BUILD_SMALL)
// from several trie states: at the root, directly after next(), after getValue(),
// from a linear-match node, and after a final match.
// The same Appendable is reused via app.reset() before each call.
void UCharsTrieTest::TestGetNextUChars() {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildMonthsTrie(builder, USTRINGTRIE_BUILD_SMALL, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_SMALL));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
UnicodeString buffer;
UnicodeStringAppendable app(buffer);
// At the root exactly the two units 'a' and 'j' are expected, in that order.
int32_t count=trie.getNextUChars(app);
int32_t count=trie->getNextUChars(app);
if(count!=2 || buffer.length()!=2 || buffer[0]!=u_a || buffer[1]!=u_j) {
errln("months getNextUChars()!=[aj] at root");
}
trie.next(u_j);
trie.next(u_a);
trie.next(u_n);
trie->next(u_j);
trie->next(u_a);
trie->next(u_n);
// getNextUChars() directly after next()
count=trie.getNextUChars(app.reset());
count=trie->getNextUChars(app.reset());
if(count!=20 || buffer!=UNICODE_STRING_SIMPLE(".abcdefghijklmnopqru")) {
errln("months getNextUChars()!=[.abcdefghijklmnopqru] after \"jan\"");
}
// getNextUChars() after getValue()
trie.getValue(); // next() had returned USTRINGTRIE_INTERMEDIATE_VALUE.
count=trie.getNextUChars(app.reset());
trie->getValue(); // next() had returned USTRINGTRIE_INTERMEDIATE_VALUE.
count=trie->getNextUChars(app.reset());
if(count!=20 || buffer!=UNICODE_STRING_SIMPLE(".abcdefghijklmnopqru")) {
errln("months getNextUChars()!=[.abcdefghijklmnopqru] after \"jan\"+getValue()");
}
// getNextUChars() from a linear-match node
trie.next(u_u);
count=trie.getNextUChars(app.reset());
trie->next(u_u);
count=trie->getNextUChars(app.reset());
if(count!=1 || buffer.length()!=1 || buffer[0]!=u_a) {
errln("months getNextUChars()!=[a] after \"janu\"");
}
trie.next(u_a);
count=trie.getNextUChars(app.reset());
trie->next(u_a);
count=trie->getNextUChars(app.reset());
if(count!=1 || buffer.length()!=1 || buffer[0]!=u_r) {
errln("months getNextUChars()!=[r] after \"janua\"");
}
trie.next(u_r);
trie.next(u_y);
trie->next(u_r);
trie->next(u_y);
// getNextUChars() after a final match
count=trie.getNextUChars(app.reset());
count=trie->getNextUChars(app.reset());
if(count!=0 || buffer.length()!=0) {
errln("months getNextUChars()!=[] after \"january\"");
}
}
void UCharsTrieTest::TestIteratorFromBranch() {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildMonthsTrie(builder, USTRINGTRIE_BUILD_FAST, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
// Go to a branch node.
trie.next(u_j);
trie.next(u_a);
trie.next(u_n);
trie->next(u_j);
trie->next(u_a);
trie->next(u_n);
IcuTestErrorCode errorCode(*this, "TestIteratorFromBranch()");
UCharsTrie::Iterator iter(trie, 0, errorCode);
UCharsTrie::Iterator iter(*trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
@ -599,20 +598,18 @@ void UCharsTrieTest::TestIteratorFromBranch() {
}
void UCharsTrieTest::TestIteratorFromLinearMatch() {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildMonthsTrie(builder, USTRINGTRIE_BUILD_SMALL, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_SMALL));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
// Go into a linear-match node.
trie.next(u_j);
trie.next(u_a);
trie.next(u_n);
trie.next(u_u);
trie.next(u_a);
trie->next(u_j);
trie->next(u_a);
trie->next(u_n);
trie->next(u_u);
trie->next(u_a);
IcuTestErrorCode errorCode(*this, "TestIteratorFromLinearMatch()");
UCharsTrie::Iterator iter(trie, 0, errorCode);
UCharsTrie::Iterator iter(*trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
@ -629,13 +626,12 @@ void UCharsTrieTest::TestIteratorFromLinearMatch() {
}
void UCharsTrieTest::TestTruncatingIteratorFromRoot() {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildMonthsTrie(builder, USTRINGTRIE_BUILD_FAST, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildMonthsTrie(USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromRoot()");
UCharsTrie::Iterator iter(trieUChars.getBuffer(), 4, errorCode);
UCharsTrie::Iterator iter(*trie, 4, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
@ -681,18 +677,16 @@ void UCharsTrieTest::TestTruncatingIteratorFromLinearMatchShort() {
{ "abcdepq", 200 },
{ "abcdeyz", 3000 }
};
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildTrie(data, LENGTHOF(data), builder, USTRINGTRIE_BUILD_FAST, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildTrie(data, LENGTHOF(data), USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
// Go into a linear-match node.
trie.next(u_a);
trie.next(u_b);
trie->next(u_a);
trie->next(u_b);
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchShort()");
// Truncate within the linear-match node.
UCharsTrie::Iterator iter(trie, 2, errorCode);
UCharsTrie::Iterator iter(*trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
@ -711,19 +705,17 @@ void UCharsTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
{ "abcdepq", 200 },
{ "abcdeyz", 3000 }
};
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildTrie(data, LENGTHOF(data), builder, USTRINGTRIE_BUILD_FAST, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildTrie(data, LENGTHOF(data), USTRINGTRIE_BUILD_FAST));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
UCharsTrie trie(trieUChars.getBuffer());
// Go into a linear-match node.
trie.next(u_a);
trie.next(u_b);
trie.next(u_c);
trie->next(u_a);
trie->next(u_b);
trie->next(u_c);
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchLong()");
// Truncate after the linear-match node.
UCharsTrie::Iterator iter(trie, 3, errorCode);
UCharsTrie::Iterator iter(*trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
@ -738,6 +730,23 @@ void UCharsTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
checkIterator(iter.reset(), expected, LENGTHOF(expected));
}
// Checks the UCharsTrie::Iterator constructor that takes the serialized UChar array
// directly, rather than a UCharsTrie object.
// The strings are added in the order given, serialized with USTRINGTRIE_BUILD_FAST,
// and the iterator is then expected to yield exactly the items in data[].
void UCharsTrieTest::TestIteratorFromUChars() {
    static const StringAndValue data[]={
        { "mm", 3 },
        { "mmm", 33 },
        { "mmnop", 333 }
    };
    builder_->clear();
    IcuTestErrorCode errorCode(*this, "TestIteratorFromUChars()");
    for(int32_t i=0; i<LENGTHOF(data); ++i) {
        builder_->add(data[i].s, data[i].value, errorCode);
    }
    UnicodeString trieUChars;
    builder_->buildUnicodeString(USTRINGTRIE_BUILD_FAST, trieUChars, errorCode);
    // Do not iterate over an empty/unbuilt array if building failed;
    // report the failure once and stop, like the other tests in this file.
    if(errorCode.logIfFailureAndReset("add()/buildUnicodeString()")) {
        return;
    }
    UCharsTrie::Iterator iter(trieUChars.getBuffer(), 0, errorCode);
    if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trieUChars) constructor")) {
        return;
    }
    checkIterator(iter, data, LENGTHOF(data));
}
void UCharsTrieTest::checkData(const StringAndValue data[], int32_t dataLength) {
logln("checkData(dataLength=%d, fast)", (int)dataLength);
checkData(data, dataLength, USTRINGTRIE_BUILD_FAST);
@ -746,20 +755,19 @@ void UCharsTrieTest::checkData(const StringAndValue data[], int32_t dataLength)
}
// Builds one trie from data[0..dataLength[ with the given build option
// and runs every check*() helper against it:
// first(), next(), next() with saved state, next(string), and the iterator.
// Returns without checking anything if buildTrie() fails.
void UCharsTrieTest::checkData(const StringAndValue data[], int32_t dataLength, UStringTrieBuildOption buildOption) {
UCharsTrieBuilder builder;
UnicodeString trieUChars;
if(!buildTrie(data, dataLength, builder, buildOption, trieUChars)) {
LocalPointer<UCharsTrie> trie(buildTrie(data, dataLength, buildOption));
if(trie.isNull()) {
return; // buildTrie() reported an error
}
checkFirst(trieUChars, data, dataLength);
checkNext(trieUChars, data, dataLength);
checkNextWithState(trieUChars, data, dataLength);
checkNextString(trieUChars, data, dataLength);
checkIterator(trieUChars, data, dataLength);
// All helpers share the same trie object (passed by non-const reference).
checkFirst(*trie, data, dataLength);
checkNext(*trie, data, dataLength);
checkNextWithState(*trie, data, dataLength);
checkNextString(*trie, data, dataLength);
checkIterator(*trie, data, dataLength);
}
UBool UCharsTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
UCharsTrieBuilder &builder, UStringTrieBuildOption buildOption, UnicodeString &result) {
UCharsTrie *UCharsTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
UStringTrieBuildOption buildOption) {
IcuTestErrorCode errorCode(*this, "buildTrie()");
// Add the items to the trie builder in an interesting (not trivial, not random) order.
int32_t index, step;
@ -775,26 +783,42 @@ UBool UCharsTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
index=dataLength-1;
step=-1;
}
builder.clear();
builder_->clear();
for(int32_t i=0; i<dataLength; ++i) {
builder.add(UnicodeString(data[index].s, -1, US_INV).unescape(),
data[index].value, errorCode);
builder_->add(UnicodeString(data[index].s, -1, US_INV).unescape(),
data[index].value, errorCode);
index=(index+step)%dataLength;
}
builder.build(buildOption, result, errorCode);
UnicodeString trieUChars;
builder_->buildUnicodeString(buildOption, trieUChars, errorCode);
LocalPointer<UCharsTrie> trie(builder_->build(buildOption, errorCode));
if(!errorCode.logIfFailureAndReset("add()/build()")) {
builder.add("zzz", 999, errorCode);
builder_->add("zzz", 999, errorCode);
if(errorCode.reset()!=U_NO_WRITE_PERMISSION) {
errln("builder.build().add(zzz) did not set U_NO_WRITE_PERMISSION");
}
}
logln("serialized trie size: %ld UChars\n", (long)result.length());
return errorCode.isSuccess();
logln("serialized trie size: %ld UChars\n", (long)trieUChars.length());
UnicodeString trieUChars2;
builder_->buildUnicodeString(buildOption, trieUChars2, errorCode);
if(trieUChars.getBuffer()==trieUChars2.getBuffer()) {
errln("builder.buildUnicodeString() before & after build() returned same array");
}
if(errorCode.isFailure()) {
return NULL;
}
// Tries from either build() method should be identical but
// UCharsTrie does not implement equals().
// We just return either one.
if((dataLength&1)!=0) {
return trie.orphan();
} else {
return new UCharsTrie(trieUChars2.getBuffer());
}
}
void UCharsTrieTest::checkFirst(const UnicodeString &trieUChars,
void UCharsTrieTest::checkFirst(UCharsTrie &trie,
const StringAndValue data[], int32_t dataLength) {
UCharsTrie trie(trieUChars.getBuffer());
for(int32_t i=0; i<dataLength; ++i) {
if(*data[i].s==0) {
continue; // skip empty string
@ -828,11 +852,11 @@ void UCharsTrieTest::checkFirst(const UnicodeString &trieUChars,
c, data[i].s);
}
}
trie.reset();
}
void UCharsTrieTest::checkNext(const UnicodeString &trieUChars,
void UCharsTrieTest::checkNext(UCharsTrie &trie,
const StringAndValue data[], int32_t dataLength) {
UCharsTrie trie(trieUChars.getBuffer());
UCharsTrie::State state;
for(int32_t i=0; i<dataLength; ++i) {
UnicodeString expectedString=UnicodeString(data[i].s, -1, US_INV).unescape();
@ -905,9 +929,8 @@ void UCharsTrieTest::checkNext(const UnicodeString &trieUChars,
}
}
void UCharsTrieTest::checkNextWithState(const UnicodeString &trieUChars,
void UCharsTrieTest::checkNextWithState(UCharsTrie &trie,
const StringAndValue data[], int32_t dataLength) {
UCharsTrie trie(trieUChars.getBuffer());
UCharsTrie::State noState, state;
for(int32_t i=0; i<dataLength; ++i) {
if((i&1)==0) {
@ -966,9 +989,8 @@ void UCharsTrieTest::checkNextWithState(const UnicodeString &trieUChars,
// next(string) is also tested in other functions,
// but here we try to go partway through the string, and then beyond it.
void UCharsTrieTest::checkNextString(const UnicodeString &trieUChars,
void UCharsTrieTest::checkNextString(UCharsTrie &trie,
const StringAndValue data[], int32_t dataLength) {
UCharsTrie trie(trieUChars.getBuffer());
for(int32_t i=0; i<dataLength; ++i) {
UnicodeString expectedString=UnicodeString(data[i].s, -1, US_INV).unescape();
int32_t stringLength=expectedString.length();
@ -985,10 +1007,10 @@ void UCharsTrieTest::checkNextString(const UnicodeString &trieUChars,
}
}
void UCharsTrieTest::checkIterator(const UnicodeString &trieUChars,
void UCharsTrieTest::checkIterator(UCharsTrie &trie,
const StringAndValue data[], int32_t dataLength) {
IcuTestErrorCode errorCode(*this, "checkIterator()");
UCharsTrie::Iterator iter(trieUChars.getBuffer(), 0, errorCode);
UCharsTrie::Iterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trieUChars) constructor")) {
return;
}

View file

@ -24,6 +24,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "unicode/localpointer.h"
#include "unicode/uperf.h"
#include "unicode/utext.h"
#include "bytestrie.h"
@ -273,6 +274,7 @@ public:
BytesTriePackageLookup(const DictionaryTriePerfTest &perf)
: PackageLookup(perf) {
IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
builder=new BytesTrieBuilder(errorCode);
int32_t count=pkg.getItemCount();
for(int32_t i=0; i<count; ++i) {
// The Package class removes the "icudt46l/" prefix.
@ -288,21 +290,23 @@ public:
// start and limit offset of the data item.
StringPiece fullName(itemNames.toStringPiece());
fullName.remove_prefix(offset);
builder.add(fullName, i, errorCode);
builder->add(fullName, i, errorCode);
// NUL-terminate the name for call() to find the next one.
itemNames.append(0, errorCode);
}
int32_t length=builder.build(USTRINGTRIE_BUILD_SMALL, errorCode).length();
int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length();
printf("size of BytesTrie: %6ld\n", (long)length);
// count+1: +1 for the last-item limit offset which we should have always had
printf("size of dataOffsets:%6ld\n", (long)((count+1)*4));
printf("total index size: %6ld\n", (long)(length+(count+1)*4));
}
// Destructor: deletes the builder that the constructor allocated with new.
virtual ~BytesTriePackageLookup() {}
virtual ~BytesTriePackageLookup() {
delete builder;
}
virtual void call(UErrorCode *pErrorCode) {
int32_t count=pkg.getItemCount();
const char *nameTrieBytes=builder.build(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data();
const char *nameTrieBytes=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data();
const char *name=itemNames.data();
for(int32_t i=0; i<count; ++i) {
if(bytesTrieLookup(name, nameTrieBytes)<0) {
@ -313,7 +317,7 @@ public:
}
protected:
BytesTrieBuilder builder;
BytesTrieBuilder *builder;
CharString itemNames;
};
@ -450,8 +454,9 @@ ucharsTrieMatches(UCharsTrie &trie,
class UCharsTrieDictLookup : public DictLookup {
public:
UCharsTrieDictLookup(const DictionaryTriePerfTest &perfTest)
: DictLookup(perfTest) {
: DictLookup(perfTest), trie(NULL) {
IcuToolErrorCode errorCode("UCharsTrieDictLookup()");
builder=new UCharsTrieBuilder(errorCode);
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
@ -459,17 +464,22 @@ public:
if(lines[i].name[0]<0x41) {
continue;
}
builder.add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode);
builder->add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode);
}
UnicodeString trieUChars;
int32_t length=builder.build(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length();
int32_t length=builder->buildUnicodeString(USTRINGTRIE_BUILD_SMALL, trieUChars, errorCode).length();
printf("size of UCharsTrie: %6ld bytes\n", (long)length*2);
trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode);
}
// Destructor: deletes the builder allocated in the constructor
// and the trie object returned by builder->build().
virtual ~UCharsTrieDictLookup() {}
virtual ~UCharsTrieDictLookup() {
delete builder;
delete trie;
}
protected:
UCharsTrieBuilder builder;
UCharsTrieBuilder *builder;
UCharsTrie *trie;
};
class UCharsTrieDictMatches : public UCharsTrieDictLookup {
@ -478,8 +488,6 @@ public:
: UCharsTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
UnicodeString uchars;
UCharsTrie trie(builder.build(USTRINGTRIE_BUILD_SMALL, uchars, *pErrorCode).getBuffer());
UText text=UTEXT_INITIALIZER;
int32_t lengths[20];
const ULine *lines=perf.getCachedLines();
@ -491,7 +499,7 @@ public:
}
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
int32_t count=0;
ucharsTrieMatches(trie, &text, lines[i].len,
ucharsTrieMatches(*trie, &text, lines[i].len,
lengths, count, LENGTHOF(lengths));
if(count==0 || lengths[count-1]!=lines[i].len) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
@ -505,17 +513,15 @@ public:
UCharsTrieDictContains(const DictionaryTriePerfTest &perfTest)
: UCharsTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
UnicodeString uchars;
UCharsTrie trie(builder.build(USTRINGTRIE_BUILD_SMALL, uchars, *pErrorCode).getBuffer());
virtual void call(UErrorCode * /*pErrorCode*/) {
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
// Skip comment lines (which start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
if(!USTRINGTRIE_HAS_VALUE(trie.reset().next(lines[i].name, lines[i].len))) {
if(!USTRINGTRIE_HAS_VALUE(trie->reset().next(lines[i].name, lines[i].len))) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
@ -550,8 +556,9 @@ static UBool thaiWordToBytes(const UChar *s, int32_t length,
class BytesTrieDictLookup : public DictLookup {
public:
BytesTrieDictLookup(const DictionaryTriePerfTest &perfTest)
: DictLookup(perfTest), noDict(FALSE) {
: DictLookup(perfTest), trie(NULL), noDict(FALSE) {
IcuToolErrorCode errorCode("BytesTrieDictLookup()");
builder=new BytesTrieBuilder(errorCode);
CharString str;
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
@ -565,18 +572,23 @@ public:
noDict=TRUE;
break;
}
builder.add(str.toStringPiece(), 0, errorCode);
builder->add(str.toStringPiece(), 0, errorCode);
}
if(!noDict) {
int32_t length=builder.build(USTRINGTRIE_BUILD_SMALL, errorCode).length();
int32_t length=builder->buildStringPiece(USTRINGTRIE_BUILD_SMALL, errorCode).length();
printf("size of BytesTrie: %6ld bytes\n", (long)length);
trie=builder->build(USTRINGTRIE_BUILD_SMALL, errorCode);
}
}
// Destructor: deletes the builder allocated in the constructor and the trie.
// The trie pointer is initialized to NULL and only set when a dictionary was built,
// so it may still be NULL here.
virtual ~BytesTrieDictLookup() {}
virtual ~BytesTrieDictLookup() {
delete builder;
delete trie;
}
protected:
BytesTrieBuilder builder;
BytesTrieBuilder *builder;
BytesTrie *trie;
UBool noDict;
};
@ -625,7 +637,6 @@ public:
if(noDict) {
return;
}
BytesTrie trie(builder.build(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data());
UText text=UTEXT_INITIALIZER;
int32_t lengths[20];
const ULine *lines=perf.getCachedLines();
@ -637,7 +648,7 @@ public:
}
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
int32_t count=0;
bytesTrieMatches(trie, &text, lines[i].len,
bytesTrieMatches(*trie, &text, lines[i].len,
lengths, count, LENGTHOF(lengths));
if(count==0 || lengths[count-1]!=lines[i].len) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
@ -651,11 +662,10 @@ public:
BytesTrieDictContains(const DictionaryTriePerfTest &perfTest)
: BytesTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
virtual void call(UErrorCode * /*pErrorCode*/) {
if(noDict) {
return;
}
BytesTrie trie(builder.build(USTRINGTRIE_BUILD_SMALL, *pErrorCode).data());
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
@ -664,14 +674,14 @@ public:
if(line[0]<0x41) {
continue;
}
UStringTrieResult result=trie.first(thaiCharToByte(line[0]));
UStringTrieResult result=trie->first(thaiCharToByte(line[0]));
int32_t lineLength=lines[i].len;
for(int32_t j=1; j<lineLength; ++j) {
if(!USTRINGTRIE_HAS_NEXT(result)) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
break;
}
result=trie.next(thaiCharToByte(line[j]));
result=trie->next(thaiCharToByte(line[j]));
}
if(!USTRINGTRIE_HAS_VALUE(result)) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);

View file

@ -121,6 +121,10 @@ BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharStrin
return diff!=0 ? diff : lengthDiff;
}
// Constructs an empty builder: no elements have been added and
// no byte array has been allocated yet.
// The error code parameter is accepted but not used by this constructor.
BytesTrieBuilder::BytesTrieBuilder(UErrorCode & /*errorCode*/)
        : elements(NULL),
          elementsCapacity(0),
          elementsLength(0),
          bytes(NULL),
          bytesCapacity(0),
          bytesLength(0) {
}
BytesTrieBuilder::~BytesTrieBuilder() {
delete[] elements;
uprv_free(bytes);
@ -170,39 +174,66 @@ compareElementStrings(const void *context, const void *left, const void *right)
U_CDECL_END
// Builds a new BytesTrie object for the add()ed data.
// Serializes via buildBytes(); on success, allocates a BytesTrie and hands the
// builder's byte array over to it (the builder's bytes pointer and capacity are reset).
// Returns NULL if errorCode indicates a failure on input or after buildBytes(),
// or if the BytesTrie allocation fails (errorCode=U_MEMORY_ALLOCATION_ERROR).
StringPiece
BytesTrie *
BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
StringPiece result;
if(U_FAILURE(errorCode)) {
return result;
}
if(bytesLength>0) {
// Already built.
result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
return result;
}
if(elementsLength==0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return result;
}
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
compareElementStrings, &strings,
FALSE, // need not be a stable sort
&errorCode);
if(U_FAILURE(errorCode)) {
return result;
}
// Duplicate strings are not allowed.
StringPiece prev=elements[0].getString(strings);
for(int32_t i=1; i<elementsLength; ++i) {
StringPiece current=elements[i].getString(strings);
if(prev==current) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return result;
buildBytes(buildOption, errorCode);
BytesTrie *newTrie=NULL;
if(U_SUCCESS(errorCode)) {
// First argument: the allocated array that the trie takes over.
// Second argument: presumably where the serialized trie starts inside that array
// (the same expression buildStringPiece() uses as the start of its result) -- confirm
// against the private BytesTrie constructor.
newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
if(newTrie==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
bytes=NULL; // The new trie now owns the array.
bytesCapacity=0;
}
}
return newTrie;
}
// Serializes the add()ed data (if not already done) and returns a StringPiece
// that refers to the serialized bytes inside the builder's own array.
// Returns an empty StringPiece if errorCode indicates a failure.
StringPiece
BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    StringPiece serialized;
    if(U_FAILURE(errorCode)) {
        return serialized;
    }
    // The serialized trie occupies the last bytesLength bytes of the array.
    char *start=bytes+(bytesCapacity-bytesLength);
    serialized.set(start, bytesLength);
    return serialized;
}
void
BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
if(bytes!=NULL && bytesLength>0) {
// Already built.
return;
}
if(bytesLength==0) {
if(elementsLength==0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
compareElementStrings, &strings,
FALSE, // need not be a stable sort
&errorCode);
if(U_FAILURE(errorCode)) {
return;
}
// Duplicate strings are not allowed.
StringPiece prev=elements[0].getString(strings);
for(int32_t i=1; i<elementsLength; ++i) {
StringPiece current=elements[i].getString(strings);
if(prev==current) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
prev=current;
}
prev=current;
}
// Create and byte-serialize the trie for the elements.
bytesLength=0;
int32_t capacity=strings.length();
if(capacity<1024) {
capacity=1024;
@ -213,17 +244,14 @@ BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCod
if(bytes==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
bytesCapacity=0;
return result;
return;
}
bytesCapacity=capacity;
}
StringTrieBuilder::build(buildOption, elementsLength, errorCode);
if(bytes==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
}
return result;
}
int32_t

View file

@ -27,18 +27,78 @@ class BytesTrieElement;
/**
* Builder class for BytesTrie.
*
* This class is not intended for public subclassing.
*/
class U_TOOLUTIL_API BytesTrieBuilder : public StringTrieBuilder {
public:
BytesTrieBuilder()
: elements(NULL), elementsCapacity(0), elementsLength(0),
bytes(NULL), bytesCapacity(0), bytesLength(0) {}
/**
* Constructs an empty builder.
* @param errorCode Standard ICU error code.
*/
BytesTrieBuilder(UErrorCode &errorCode);
/**
* Destructor.
*/
virtual ~BytesTrieBuilder();
/**
* Adds a (byte sequence, value) pair.
* The byte sequence must be unique.
* The bytes will be copied; the builder does not keep
* a reference to the input StringPiece or its data().
* @param s The input byte sequence.
* @param value The value associated with this byte sequence.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
*/
BytesTrieBuilder &add(const StringPiece &s, int32_t value, UErrorCode &errorCode);
StringPiece build(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Builds a BytesTrie for the add()ed data.
* Once built, no further data can be add()ed until clear() is called.
*
* This method passes ownership of the builder's internal result array to the new trie object.
* Another call to any build() variant will re-serialize the trie.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A new BytesTrie for the add()ed data.
*/
BytesTrie *build(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Builds a BytesTrie for the add()ed data and byte-serializes it.
* Once built, no further data can be add()ed until clear() is called.
*
* Multiple calls to buildStringPiece() return StringPieces referring to the
* builder's same byte array, without rebuilding.
* If buildStringPiece() is called after build(), the trie will be
* re-serialized into a new array.
* If build() is called after buildStringPiece(), the trie object will become
* the owner of the previously returned array.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A StringPiece which refers to the byte-serialized BytesTrie for the add()ed data.
*/
StringPiece buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Removes all (byte sequence, value) pairs.
* New data can then be add()ed and a new trie can be built.
* @return *this
*/
BytesTrieBuilder &clear() {
strings.clear();
elementsLength=0;
@ -47,6 +107,11 @@ public:
}
private:
BytesTrieBuilder(const BytesTrieBuilder &other); // no copy constructor
BytesTrieBuilder &operator=(const BytesTrieBuilder &other); // no assignment operator
void buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
virtual int32_t getElementStringLength(int32_t i) const;
virtual UChar getElementUnit(int32_t i, int32_t byteIndex) const;
virtual int32_t getElementValue(int32_t i) const;

View file

@ -19,8 +19,23 @@
#include "unicode/uobject.h"
#include "uhash.h"
/**
* Build options for BytesTrieBuilder and UCharsTrieBuilder.
*/
enum UStringTrieBuildOption {
/**
* Builds a trie quickly.
*/
USTRINGTRIE_BUILD_FAST,
/**
* Builds a trie more slowly, attempting to generate
* a shorter but equivalent serialization.
* This build option also uses more memory.
*
* This option can be effective when many integer values are the same
* and string/byte sequence suffixes can be shared.
* Runtime speed is not expected to improve.
*/
USTRINGTRIE_BUILD_SMALL
};
@ -28,6 +43,8 @@ U_NAMESPACE_BEGIN
/**
* Base class for string trie builder classes.
*
* This class is not intended for public subclassing.
*/
class U_TOOLUTIL_API StringTrieBuilder : public UObject {
public:

View file

@ -14,6 +14,7 @@
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "cmemory.h"
#include "uassert.h"
#include "ucharstrie.h"
@ -48,6 +49,10 @@ Appendable::append(const UChar *s, int32_t length) {
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Appendable)
// Destructor: frees the owned array.
// ownedArray_ is NULL for tries created via the public constructor or the copy constructor,
// which only alias the data.
UCharsTrie::~UCharsTrie() {
uprv_free(ownedArray_);
}
UStringTrieResult
UCharsTrie::current() const {
const UChar *pos=pos_;

View file

@ -71,17 +71,47 @@ private:
* Light-weight, non-const reader class for a UCharsTrie.
* Traverses a UChar-serialized data structure with minimal state,
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
*
* This class owns the serialized trie data only if it was constructed by
* the builder's build() method.
* The public constructor and the copy constructor only alias the data (only copy the pointer).
* There is no assignment operator.
*
* This class is not intended for public subclassing.
*/
class U_TOOLUTIL_API UCharsTrie : public UMemory {
public:
/**
* Constructs a UCharsTrie reader instance.
* @param trieUChars The trie UChars.
*
* The trieUChars must contain a copy of a UChar sequence from the UCharsTrieBuilder,
* starting with the first UChar of that sequence.
* The UCharsTrie object will not read more UChars than
* the UCharsTrieBuilder generated in the corresponding build() call.
*
* The array is not copied/cloned and must not be modified while
* the UCharsTrie object is in use.
*
* @param trieUChars The UChar array that contains the serialized trie.
*/
UCharsTrie(const UChar *trieUChars)
: uchars_(trieUChars),
: ownedArray_(NULL), uchars_(trieUChars),
pos_(uchars_), remainingMatchLength_(-1) {}
/**
* Destructor.
*/
~UCharsTrie();
/**
* Copy constructor, copies the other trie reader object and its state
* (current position and remaining match length),
* but not the UChar array which will be shared. (Shallow copy.)
* The copy's ownedArray_ is always NULL: the copy never takes ownership
* of the array, even if the other object owns it.
* @param other Another UCharsTrie object.
*/
UCharsTrie(const UCharsTrie &other)
: ownedArray_(NULL), uchars_(other.uchars_),
pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
/**
* Resets this trie to its initial state.
*/
@ -142,6 +172,7 @@ public:
/**
* Traverses the trie from the initial state for this input UChar.
* Equivalent to reset().next(uchar).
* @param uchar Input char value. Values below 0 and above 0xffff will never match.
* @return The match/value Result.
*/
inline UStringTrieResult first(int32_t uchar) {
@ -153,6 +184,7 @@ public:
* Traverses the trie from the initial state for the
* one or two UTF-16 code units for this input code point.
* Equivalent to reset().nextForCodePoint(cp).
* @param cp A Unicode code point 0..0x10ffff.
* @return The match/value Result.
*/
inline UStringTrieResult firstForCodePoint(UChar32 cp) {
@ -165,6 +197,7 @@ public:
/**
* Traverses the trie from the current state for this input UChar.
* @param uchar Input char value. Values below 0 and above 0xffff will never match.
* @return The match/value Result.
*/
UStringTrieResult next(int32_t uchar);
@ -172,6 +205,7 @@ public:
/**
* Traverses the trie from the current state for the
* one or two UTF-16 code units for this input code point.
* @param cp A Unicode code point 0..0x10ffff.
* @return The match/value Result.
*/
inline UStringTrieResult nextForCodePoint(UChar32 cp) {
@ -328,6 +362,19 @@ public:
private:
friend class UCharsTrieBuilder;
/**
* Constructs a UCharsTrie reader instance.
* Unlike the public constructor which just aliases an array,
* this constructor adopts the builder's array.
* This constructor is only called by the builder.
* The reader starts in its initial state: pos_ is set to the start of
* the trie and remainingMatchLength_ to -1.
* @param adoptUChars The array to adopt; stored in ownedArray_.
* @param trieUChars Start of the serialized trie; stored in uchars_.
*     It need not equal adoptUChars.
*/
UCharsTrie(UChar *adoptUChars, const UChar *trieUChars)
: ownedArray_(adoptUChars), uchars_(trieUChars),
pos_(uchars_), remainingMatchLength_(-1) {}
// No assignment operator.
UCharsTrie &operator=(const UCharsTrie &other);
// Clears the current position by setting pos_ to NULL.
// NOTE(review): presumably a NULL pos_ means "no further match possible"
// for subsequent traversal calls -- confirm against next()/current().
inline void stop() {
pos_=NULL;
}
@ -513,6 +560,8 @@ private:
static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
UChar *ownedArray_;
// Fixed value referencing the UCharsTrie words.
const UChar *uchars_;

View file

@ -80,6 +80,10 @@ UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const Unicode
return getString(strings).compare(other.getString(strings));
}
// Constructs an empty builder: no elements array and no serialized UChar array
// are allocated yet (all pointers NULL, all capacities and lengths 0).
// The errorCode parameter is accepted but not used by this constructor
// (its name is commented out).
UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
: elements(NULL), elementsCapacity(0), elementsLength(0),
uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
UCharsTrieBuilder::~UCharsTrieBuilder() {
delete[] elements;
uprv_free(uchars);
@ -132,42 +136,70 @@ compareElementStrings(const void *context, const void *left, const void *right)
U_CDECL_END
UnicodeString &
UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return result;
}
if(ucharsLength>0) {
// Already built.
result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
return result;
}
if(elementsLength==0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return result;
}
if(strings.isBogus()) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
compareElementStrings, &strings,
FALSE, // need not be a stable sort
&errorCode);
if(U_FAILURE(errorCode)) {
return result;
}
// Duplicate strings are not allowed.
UnicodeString prev=elements[0].getString(strings);
for(int32_t i=1; i<elementsLength; ++i) {
UnicodeString current=elements[i].getString(strings);
if(prev==current) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return result;
// Serializes the add()ed data via buildUChars() and wraps the result in a
// newly allocated UCharsTrie that adopts the builder's UChar array.
// Returns NULL if errorCode indicates a failure, either on input, from
// buildUChars(), or because the UCharsTrie object could not be allocated
// (U_MEMORY_ALLOCATION_ERROR). On allocation failure the builder keeps its array.
UCharsTrie *
UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildUChars(buildOption, errorCode);
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    // The serialized trie occupies the last ucharsLength units of the array.
    const UChar *trieStart=uchars+(ucharsCapacity-ucharsLength);
    UCharsTrie *trie=new UCharsTrie(uchars, trieStart);
    if(trie==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    // The new trie now owns the array; the builder forgets it.
    uchars=NULL;
    ucharsCapacity=0;
    return trie;
}
// Serializes the add()ed data via buildUChars() and, on success, sets result
// to the serialized units inside the builder's own UChar array
// (the array is not handed over; the builder keeps it).
// On failure result is returned unmodified.
UnicodeString &
UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
                                      UErrorCode &errorCode) {
    buildUChars(buildOption, errorCode);
    if(U_FAILURE(errorCode)) {
        return result;
    }
    // The serialized trie occupies the last ucharsLength units of the array.
    const UChar *trieStart=uchars+(ucharsCapacity-ucharsLength);
    result.setTo(FALSE, trieStart, ucharsLength);
    return result;
}
void
UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
if(uchars!=NULL && ucharsLength>0) {
// Already built.
return;
}
if(ucharsLength==0) {
if(elementsLength==0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
if(strings.isBogus()) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
compareElementStrings, &strings,
FALSE, // need not be a stable sort
&errorCode);
if(U_FAILURE(errorCode)) {
return;
}
// Duplicate strings are not allowed.
UnicodeString prev=elements[0].getString(strings);
for(int32_t i=1; i<elementsLength; ++i) {
UnicodeString current=elements[i].getString(strings);
if(prev==current) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
prev.fastCopyFrom(current);
}
prev.fastCopyFrom(current);
}
// Create and UChar-serialize the trie for the elements.
ucharsLength=0;
int32_t capacity=strings.length();
if(capacity<1024) {
capacity=1024;
@ -178,17 +210,14 @@ UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UnicodeString &resu
if(uchars==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
ucharsCapacity=0;
return result;
return;
}
ucharsCapacity=capacity;
}
StringTrieBuilder::build(buildOption, elementsLength, errorCode);
if(uchars==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
}
return result;
}
int32_t

View file

@ -26,18 +26,81 @@ class UCharsTrieElement;
/**
* Builder class for UCharsTrie.
*
* This class is not intended for public subclassing.
*/
class U_TOOLUTIL_API UCharsTrieBuilder : public StringTrieBuilder {
public:
UCharsTrieBuilder()
: elements(NULL), elementsCapacity(0), elementsLength(0),
uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
/**
* Constructs an empty builder.
* @param errorCode Standard ICU error code.
*/
UCharsTrieBuilder(UErrorCode &errorCode);
/**
* Destructor.
*/
virtual ~UCharsTrieBuilder();
/**
* Adds a (string, value) pair.
* The string must be unique.
* The string contents will be copied; the builder does not keep
* a reference to the input UnicodeString or its buffer.
* @param s The input string.
* @param value The value associated with this string.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return *this
*/
UCharsTrieBuilder &add(const UnicodeString &s, int32_t value, UErrorCode &errorCode);
UnicodeString &build(UStringTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode);
/**
* Builds a UCharsTrie for the add()ed data.
* Once built, no further data can be add()ed until clear() is called.
*
* This method passes ownership of the builder's internal result array to the new trie object.
* Another call to any build() variant will re-serialize the trie.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return A new UCharsTrie for the add()ed data.
*/
UCharsTrie *build(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
/**
* Builds a UCharsTrie for the add()ed data and UChar-serializes it.
* Once built, no further data can be add()ed until clear() is called.
*
* Multiple calls to buildUnicodeString() set the UnicodeStrings to the
* builder's same UChar array, without rebuilding.
* If buildUnicodeString() is called after build(), the trie will be
* re-serialized into a new array.
* If build() is called after buildUnicodeString(), the trie object will become
* the owner of the previously returned array.
* After clear() has been called, a new array will be used as well.
* @param buildOption Build option, see UStringTrieBuildOption.
* @param result A UnicodeString which will be set to the UChar-serialized
* UCharsTrie for the add()ed data.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return result
*/
UnicodeString &buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
UErrorCode &errorCode);
/**
* Removes all (string, value) pairs.
* New data can then be add()ed and a new trie can be built.
* @return *this
*/
UCharsTrieBuilder &clear() {
strings.remove();
elementsLength=0;
@ -46,6 +109,11 @@ public:
}
private:
UCharsTrieBuilder(const UCharsTrieBuilder &other); // no copy constructor
UCharsTrieBuilder &operator=(const UCharsTrieBuilder &other); // no assignment operator
void buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode);
virtual int32_t getElementStringLength(int32_t i) const;
virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const;
virtual int32_t getElementValue(int32_t i) const;