mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 14:05:32 +00:00
ICU-8105 hardcode Unicode property names data (formatVersion 2); includes new dictionary-type tries (ByteTrie & UCharTrie see ticket #8167); merge branches/markus/tries -r 29040:29249
X-SVN-Rev: 29252
This commit is contained in:
parent
3e29cb9f1f
commit
c04082d93c
53 changed files with 9209 additions and 1067 deletions
|
@ -85,7 +85,7 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
|
|||
ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \
|
||||
uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \
|
||||
ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o loclikely.o locresdata.o \
|
||||
bytestream.o stringpiece.o \
|
||||
bytestream.o stringpiece.o bytetrie.o \
|
||||
ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
|
||||
utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
|
||||
normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \
|
||||
|
|
431
icu4c/source/common/bytetrie.cpp
Normal file
431
icu4c/source/common/bytetrie.cpp
Normal file
|
@ -0,0 +1,431 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetrie.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uassert.h"
|
||||
#include "bytetrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// lead byte already shifted right by 1.
|
||||
int32_t
|
||||
ByteTrie::readValue(const uint8_t *pos, int32_t leadByte) {
|
||||
int32_t value;
|
||||
if(leadByte<kMinTwoByteValueLead) {
|
||||
value=leadByte-kMinOneByteValueLead;
|
||||
} else if(leadByte<kMinThreeByteValueLead) {
|
||||
value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
|
||||
} else if(leadByte<kFourByteValueLead) {
|
||||
value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
|
||||
} else if(leadByte==kFourByteValueLead) {
|
||||
value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
|
||||
} else {
|
||||
value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
const uint8_t *
|
||||
ByteTrie::jumpByDelta(const uint8_t *pos) {
|
||||
int32_t delta=*pos++;
|
||||
if(delta<kMinTwoByteDeltaLead) {
|
||||
// nothing to do
|
||||
} else if(delta<kMinThreeByteDeltaLead) {
|
||||
delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
|
||||
} else if(delta<kFourByteDeltaLead) {
|
||||
delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
|
||||
pos+=2;
|
||||
} else if(delta==kFourByteDeltaLead) {
|
||||
delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
|
||||
pos+=3;
|
||||
} else {
|
||||
delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
|
||||
pos+=4;
|
||||
}
|
||||
return pos+delta;
|
||||
}
|
||||
|
||||
// Reports the match/value state at the current position without consuming input.
UDictTrieResult
ByteTrie::current() const {
    if(pos_==NULL) {
        // The trie was stopped by an earlier mismatch.
        return UDICTTRIE_NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        // Still inside a linear-match node: the next trie byte is not a node lead byte.
        return UDICTTRIE_NO_VALUE;
    }
    const int32_t lead=*pos_;
    return lead>=kMinValueLead ? valueResult(lead) : UDICTTRIE_NO_VALUE;
}
|
||||
|
||||
// Selects the branch entry that matches inByte.
// pos is just after the branch node lead byte; length is that lead byte
// (0 means that the real length-1 is stored in the following byte).
// On a match, pos_ is updated and the value result at the new position is
// returned; otherwise the trie is stopped and UDICTTRIE_NO_MATCH is returned.
UDictTrieResult
ByteTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
    if(length==0) {
        length=*pos++;
    }
    ++length;
    // length is now the number of bytes to select from.
    // Large branches are encoded as a binary search: one comparison byte
    // followed by a jump delta for the less-than half.
    while(length>kMaxBranchLinearSubNodeLength) {
        const int32_t splitByte=*pos++;
        if(inByte<splitByte) {
            length>>=1;
            pos=jumpByDelta(pos);
        } else {
            length-=length>>1;
            pos=skipDelta(pos);
        }
    }
    // Linear search over the remaining (key, value) pairs.
    // length>=2 here because the loop above only halves lengths that are
    // greater than kMaxBranchLinearSubNodeLength>=3.
    do {
        if(inByte!=*pos++) {
            // Not this key: step over its value and try the next entry.
            --length;
            pos=skipValue(pos);
            continue;
        }
        int32_t lead=*pos;
        U_ASSERT(lead>=kMinValueLead);
        if(lead&kValueIsFinal) {
            // Leave the final value in place for getValue() to read.
            pos_=pos;
            return UDICTTRIE_HAS_FINAL_VALUE;
        }
        // A non-final value is the jump delta to the next node.
        // Inlined equivalent of readValue(pos, lead>>1) that also advances pos.
        ++pos;
        lead>>=1;
        int32_t delta;
        if(lead<kMinTwoByteValueLead) {
            delta=lead-kMinOneByteValueLead;
        } else if(lead<kMinThreeByteValueLead) {
            delta=((lead-kMinTwoByteValueLead)<<8)|pos[0];
            pos+=1;
        } else if(lead<kFourByteValueLead) {
            delta=((lead-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
            pos+=2;
        } else if(lead==kFourByteValueLead) {
            delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
            pos+=3;
        } else {
            delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
            pos+=4;
        }
        pos+=delta;
        pos_=pos;
        lead=*pos;
        return lead>=kMinValueLead ? valueResult(lead) : UDICTTRIE_NO_VALUE;
    } while(length>1);
    // The last comparison byte has no value; its node follows directly.
    if(inByte!=*pos++) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    pos_=pos;
    const int32_t nextLead=*pos;
    return nextLead>=kMinValueLead ? valueResult(nextLead) : UDICTTRIE_NO_VALUE;
}
|
||||
|
||||
// Matches inByte against the node(s) starting at pos.
// Called only when the trie is not in the middle of a linear-match node.
UDictTrieResult
ByteTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
    for(;;) {
        const int32_t lead=*pos++;
        if(lead<kMinLinearMatch) {
            // Branch node.
            return branchNext(pos, lead, inByte);
        }
        if(lead>=kMinValueLead) {
            if(lead&kValueIsFinal) {
                // A final value has no further matching bytes.
                break;
            }
            // Step over an intermediate value; the node after it is examined next.
            pos=skipValue(pos, lead);
            // The next node must not also be a value node.
            U_ASSERT(*pos<kMinValueLead);
            continue;
        }
        // Linear-match node: compare the first of its bytes.
        if(inByte!=*pos++) {
            break;
        }
        // Number of bytes still to match in this node, minus 1.
        const int32_t remaining=lead-kMinLinearMatch-1;
        remainingMatchLength_=remaining;
        pos_=pos;
        if(remaining>=0) {
            return UDICTTRIE_NO_VALUE;
        }
        const int32_t nextLead=*pos;
        return nextLead>=kMinValueLead ? valueResult(nextLead) : UDICTTRIE_NO_VALUE;
    }
    stop();
    return UDICTTRIE_NO_MATCH;
}
|
||||
|
||||
// Traverses the trie from the current state for one input byte.
// inByte: the input byte value; values -0x100..-1 (as produced by reading a
//         byte through a signed char) are treated like 0..0xff.
// Returns the match/value result; after UDICTTRIE_NO_MATCH the trie is
// stopped and stays stopped until it is reset.
UDictTrieResult
ByteTrie::next(int32_t inByte) {
    const uint8_t *pos=pos_;
    if(pos==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    if(inByte<0) {
        // Trie bytes are read as 0..0xff, so a negative input could never match.
        inByte+=0x100;
    }
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(length<0) {
        // Not inside a linear-match node: interpret the node at pos.
        return nextImpl(pos, inByte);
    }
    // Remaining part of a linear-match node.
    if(inByte!=*pos++) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    remainingMatchLength_=--length;
    pos_=pos;
    if(length>=0) {
        // More bytes of this linear-match node are pending.
        return UDICTTRIE_NO_VALUE;
    }
    const int32_t node=*pos;
    return node>=kMinValueLead ? valueResult(node) : UDICTTRIE_NO_VALUE;
}
|
||||
|
||||
// Traverses the trie from the current state for a whole byte sequence.
// s:       the input bytes
// sLength: number of bytes in s, or negative if s is NUL-terminated
// Returns the result for the last byte consumed, or UDICTTRIE_NO_MATCH
// (with the trie stopped) as soon as one byte fails to match.
// Empty input returns current().
UDictTrieResult
ByteTrie::next(const char *s, int32_t sLength) {
    if(sLength<0 ? *s==0 : sLength==0) {
        // Empty input.
        return current();
    }
    const uint8_t *pos=pos_;
    if(pos==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    int32_t length=remainingMatchLength_;  // Actual remaining match length minus 1.
    for(;;) {
        // Fetch the next input byte, if there is one.
        // Continue a linear-match node without rechecking sLength<0.
        // Input bytes are read as uint8_t: plain char may be signed, and a
        // negative value would never compare equal to a trie byte.
        int32_t inByte;
        if(sLength<0) {
            for(;;) {
                if((inByte=(uint8_t)*s++)==0) {
                    // End of input: save the state and report the value, if any.
                    remainingMatchLength_=length;
                    pos_=pos;
                    int32_t node;
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
                            valueResult(node) : UDICTTRIE_NO_VALUE;
                }
                if(length<0) {
                    // No pending linear match: go interpret the node at pos.
                    remainingMatchLength_=length;
                    break;
                }
                if(inByte!=*pos) {
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                ++pos;
                --length;
            }
        } else {
            for(;;) {
                if(sLength==0) {
                    // End of input: save the state and report the value, if any.
                    remainingMatchLength_=length;
                    pos_=pos;
                    int32_t node;
                    return (length<0 && (node=*pos)>=kMinValueLead) ?
                            valueResult(node) : UDICTTRIE_NO_VALUE;
                }
                inByte=(uint8_t)*s++;
                --sLength;
                if(length<0) {
                    // No pending linear match: go interpret the node at pos.
                    remainingMatchLength_=length;
                    break;
                }
                if(inByte!=*pos) {
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                ++pos;
                --length;
            }
        }
        for(;;) {
            int32_t node=*pos++;
            if(node<kMinLinearMatch) {
                // Branch node.
                UDictTrieResult result=branchNext(pos, node, inByte);
                if(result==UDICTTRIE_NO_MATCH) {
                    return UDICTTRIE_NO_MATCH;
                }
                // Fetch the next input byte, if there is one.
                if(sLength<0) {
                    if((inByte=(uint8_t)*s++)==0) {
                        return result;
                    }
                } else {
                    if(sLength==0) {
                        return result;
                    }
                    inByte=(uint8_t)*s++;
                    --sLength;
                }
                if(result==UDICTTRIE_HAS_FINAL_VALUE) {
                    // No further matching bytes.
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
            } else if(node<kMinValueLead) {
                // Linear-match node: match length+1 bytes.
                length=node-kMinLinearMatch;  // Actual match length minus 1.
                if(inByte!=*pos) {
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                ++pos;
                --length;
                // Continue in the outer loop, which matches the rest of this node.
                break;
            } else if(node&kValueIsFinal) {
                // No further matching bytes.
                stop();
                return UDICTTRIE_NO_MATCH;
            } else {
                // Skip intermediate value.
                pos=skipValue(pos, node);
                // The next node must not also be a value node.
                U_ASSERT(*pos<kMinValueLead);
            }
        }
    }
}
|
||||
|
||||
const uint8_t *
|
||||
ByteTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
|
||||
UBool haveUniqueValue, int32_t &uniqueValue) {
|
||||
while(length>kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison byte
|
||||
if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
|
||||
return NULL;
|
||||
}
|
||||
length=length-(length>>1);
|
||||
pos=skipDelta(pos);
|
||||
}
|
||||
do {
|
||||
++pos; // ignore a comparison byte
|
||||
// handle its value
|
||||
int32_t node=*pos++;
|
||||
UBool isFinal=(UBool)(node&kValueIsFinal);
|
||||
int32_t value=readValue(pos, node>>1);
|
||||
pos=skipValue(pos, node);
|
||||
if(isFinal) {
|
||||
if(haveUniqueValue) {
|
||||
if(value!=uniqueValue) {
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
uniqueValue=value;
|
||||
haveUniqueValue=TRUE;
|
||||
}
|
||||
} else {
|
||||
if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
|
||||
return NULL;
|
||||
}
|
||||
haveUniqueValue=TRUE;
|
||||
}
|
||||
} while(--length>1);
|
||||
return pos+1; // ignore the last comparison byte
|
||||
}
|
||||
|
||||
// Helper for hasUniqueValue(): starting on a node lead byte at pos, checks
// whether every value reached from there is the same.
// haveUniqueValue says whether uniqueValue already holds a value to compare
// against; otherwise the first value found is stored into uniqueValue.
// Returns FALSE when two different values are found, TRUE when a final value
// ends the walk.
UBool
ByteTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    for(;;) {
        int32_t node=*pos++;
        if(node>=kMinValueLead) {
            // Value node.
            const int32_t value=readValue(pos, node>>1);
            if(!haveUniqueValue) {
                uniqueValue=value;
                haveUniqueValue=TRUE;
            } else if(value!=uniqueValue) {
                return FALSE;
            }
            if(node&kValueIsFinal) {
                return TRUE;
            }
            pos=skipValue(pos, node);
        } else if(node>=kMinLinearMatch) {
            // Linear-match node: ignore the match bytes.
            pos+=node-kMinLinearMatch+1;
        } else {
            // Branch node; a zero lead byte means the length-1 is in the next byte.
            if(node==0) {
                node=*pos++;
            }
            pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
            if(pos==NULL) {
                return FALSE;
            }
            haveUniqueValue=TRUE;
        }
    }
}
|
||||
|
||||
int32_t
|
||||
ByteTrie::getNextBytes(ByteSink &out) const {
|
||||
const uint8_t *pos=pos_;
|
||||
if(pos==NULL) {
|
||||
return 0;
|
||||
}
|
||||
if(remainingMatchLength_>=0) {
|
||||
append(out, *pos); // Next byte of a pending linear-match node.
|
||||
return 1;
|
||||
}
|
||||
int32_t node=*pos++;
|
||||
if(node>=kMinValueLead) {
|
||||
if(node&kValueIsFinal) {
|
||||
return 0;
|
||||
} else {
|
||||
pos=skipValue(pos, node);
|
||||
node=*pos++;
|
||||
U_ASSERT(node<kMinValueLead);
|
||||
}
|
||||
}
|
||||
if(node<kMinLinearMatch) {
|
||||
if(node==0) {
|
||||
node=*pos++;
|
||||
}
|
||||
getNextBranchBytes(pos, ++node, out);
|
||||
return node;
|
||||
} else {
|
||||
// First byte of the linear-match node.
|
||||
append(out, *pos);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
|
||||
while(length>kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison byte
|
||||
getNextBranchBytes(jumpByDelta(pos), length>>1, out);
|
||||
length=length-(length>>1);
|
||||
pos=skipDelta(pos);
|
||||
}
|
||||
do {
|
||||
append(out, *pos++);
|
||||
pos=skipValue(pos);
|
||||
} while(--length>1);
|
||||
append(out, *pos);
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrie::append(ByteSink &out, int c) {
|
||||
char ch=(char)c;
|
||||
out.Append(&ch, 1);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
331
icu4c/source/common/bytetrie.h
Normal file
331
icu4c/source/common/bytetrie.h
Normal file
|
@ -0,0 +1,331 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetrie.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __BYTETRIE_H__
|
||||
#define __BYTETRIE_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Dictionary trie for mapping arbitrary byte sequences
|
||||
* to integer values.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uassert.h"
|
||||
#include "udicttrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class ByteSink;
|
||||
class ByteTrieBuilder;
|
||||
class ByteTrieIterator;
|
||||
|
||||
/**
|
||||
* Light-weight, non-const reader class for a ByteTrie.
|
||||
* Traverses a byte-serialized data structure with minimal state,
|
||||
* for mapping byte sequences to non-negative integer values.
|
||||
*/
|
||||
class U_COMMON_API ByteTrie : public UMemory {
|
||||
public:
|
||||
ByteTrie(const void *trieBytes)
|
||||
: bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
|
||||
pos_(bytes_), remainingMatchLength_(-1) {}
|
||||
|
||||
/**
|
||||
* Resets this trie to its initial state.
|
||||
*/
|
||||
ByteTrie &reset() {
|
||||
pos_=bytes_;
|
||||
remainingMatchLength_=-1;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* ByteTrie state object, for saving a trie's current state
|
||||
* and resetting the trie back to this state later.
|
||||
*/
|
||||
class State : public UMemory {
|
||||
public:
|
||||
State() { bytes=NULL; }
|
||||
private:
|
||||
friend class ByteTrie;
|
||||
|
||||
const uint8_t *bytes;
|
||||
const uint8_t *pos;
|
||||
int32_t remainingMatchLength;
|
||||
};
|
||||
|
||||
/**
|
||||
* Saves the state of this trie.
|
||||
* @see resetToState
|
||||
*/
|
||||
const ByteTrie &saveState(State &state) const {
|
||||
state.bytes=bytes_;
|
||||
state.pos=pos_;
|
||||
state.remainingMatchLength=remainingMatchLength_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets this trie to the saved state.
|
||||
* If the state object contains no state, or the state of a different trie,
|
||||
* then this trie remains unchanged.
|
||||
* @see saveState
|
||||
* @see reset
|
||||
*/
|
||||
ByteTrie &resetToState(const State &state) {
|
||||
if(bytes_==state.bytes && bytes_!=NULL) {
|
||||
pos_=state.pos;
|
||||
remainingMatchLength_=state.remainingMatchLength;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the byte sequence so far matches, whether it has a value,
|
||||
* and whether another input byte can continue a matching byte sequence.
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
UDictTrieResult current() const;
|
||||
|
||||
/**
|
||||
* Traverses the trie from the initial state for this input byte.
|
||||
* Equivalent to reset().next(inByte).
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
inline UDictTrieResult first(int32_t inByte) {
|
||||
remainingMatchLength_=-1;
|
||||
return nextImpl(bytes_, inByte);
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverses the trie from the current state for this input byte.
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
UDictTrieResult next(int32_t inByte);
|
||||
|
||||
/**
|
||||
* Traverses the trie from the current state for this byte sequence.
|
||||
* Equivalent to
|
||||
* \code
|
||||
* Result result=current();
|
||||
* for(each c in s)
|
||||
* if((result=next(c))==UDICTTRIE_NO_MATCH) return UDICTTRIE_NO_MATCH;
|
||||
* return result;
|
||||
* \endcode
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
UDictTrieResult next(const char *s, int32_t length);
|
||||
|
||||
/**
|
||||
* Returns a matching byte sequence's value if called immediately after
|
||||
* current()/first()/next() returned UDICTTRIE_HAS_VALUE or UDICTTRIE_HAS_FINAL_VALUE.
|
||||
* getValue() can be called multiple times.
|
||||
*
|
||||
* Do not call getValue() after UDICTTRIE_NO_MATCH or UDICTTRIE_NO_VALUE!
|
||||
*/
|
||||
inline int32_t getValue() const {
|
||||
const uint8_t *pos=pos_;
|
||||
int32_t leadByte=*pos++;
|
||||
U_ASSERT(leadByte>=kMinValueLead);
|
||||
return readValue(pos, leadByte>>1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether all byte sequences reachable from the current state
|
||||
* map to the same value.
|
||||
* @param uniqueValue Receives the unique value, if this function returns TRUE.
|
||||
* (output-only)
|
||||
* @return TRUE if all byte sequences reachable from the current state
|
||||
* map to the same value.
|
||||
*/
|
||||
inline UBool hasUniqueValue(int32_t &uniqueValue) const {
|
||||
const uint8_t *pos=pos_;
|
||||
// Skip the rest of a pending linear-match node.
|
||||
return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds each byte which continues the byte sequence from the current state.
|
||||
* That is, each byte b for which it would be next(b)!=UDICTTRIE_NO_MATCH now.
|
||||
* @param out Each next byte is appended to this object.
|
||||
* (Only uses the out.Append(s, length) method.)
|
||||
* @return the number of bytes which continue the byte sequence from here
|
||||
*/
|
||||
int32_t getNextBytes(ByteSink &out) const;
|
||||
|
||||
private:
|
||||
friend class ByteTrieBuilder;
|
||||
friend class ByteTrieIterator;
|
||||
|
||||
inline void stop() {
|
||||
pos_=NULL;
|
||||
}
|
||||
|
||||
// Reads a compact 32-bit integer.
|
||||
// pos is already after the leadByte, and the lead byte is already shifted right by 1.
|
||||
static int32_t readValue(const uint8_t *pos, int32_t leadByte);
|
||||
static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
|
||||
U_ASSERT(leadByte>=kMinValueLead);
|
||||
if(leadByte>=(kMinTwoByteValueLead<<1)) {
|
||||
if(leadByte<(kMinThreeByteValueLead<<1)) {
|
||||
++pos;
|
||||
} else if(leadByte<(kFourByteValueLead<<1)) {
|
||||
pos+=2;
|
||||
} else {
|
||||
pos+=3+((leadByte>>1)&1);
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
static inline const uint8_t *skipValue(const uint8_t *pos) {
|
||||
int32_t leadByte=*pos++;
|
||||
return skipValue(pos, leadByte);
|
||||
}
|
||||
|
||||
// Reads a jump delta and jumps.
|
||||
static const uint8_t *jumpByDelta(const uint8_t *pos);
|
||||
|
||||
static inline const uint8_t *skipDelta(const uint8_t *pos) {
|
||||
int32_t delta=*pos++;
|
||||
if(delta>=kMinTwoByteDeltaLead) {
|
||||
if(delta<kMinThreeByteDeltaLead) {
|
||||
++pos;
|
||||
} else if(delta<kFourByteDeltaLead) {
|
||||
pos+=2;
|
||||
} else {
|
||||
pos+=3+(delta&1);
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline UDictTrieResult valueResult(int32_t node) {
|
||||
return (UDictTrieResult)(UDICTTRIE_HAS_VALUE-(node&kValueIsFinal));
|
||||
}
|
||||
|
||||
// Handles a branch node for both next(byte) and next(string).
|
||||
UDictTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
|
||||
|
||||
// Requires remainingLength_<0.
|
||||
UDictTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
|
||||
|
||||
// Helper functions for hasUniqueValue().
|
||||
// Recursively finds a unique value (or whether there is not a unique one)
|
||||
// from a branch.
|
||||
static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
|
||||
UBool haveUniqueValue, int32_t &uniqueValue);
|
||||
// Recursively finds a unique value (or whether there is not a unique one)
|
||||
// starting from a position on a node lead byte.
|
||||
static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
|
||||
|
||||
// Helper functions for getNextBytes().
|
||||
// getNextBytes() when pos is on a branch node.
|
||||
static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
|
||||
static void append(ByteSink &out, int c);
|
||||
|
||||
// ByteTrie data structure
|
||||
//
|
||||
// The trie consists of a series of byte-serialized nodes for incremental
|
||||
// string/byte sequence matching. The root node is at the beginning of the trie data.
|
||||
//
|
||||
// Types of nodes are distinguished by their node lead byte ranges.
|
||||
// After each node, except a final-value node, another node follows to
|
||||
// encode match values or continue matching further bytes.
|
||||
//
|
||||
// Node types:
|
||||
// - Value node: Stores a 32-bit integer in a compact, variable-length format.
|
||||
// The value is for the string/byte sequence so far.
|
||||
// One node bit indicates whether the value is final or whether
|
||||
// matching continues with the next node.
|
||||
// - Linear-match node: Matches a number of bytes.
|
||||
// - Branch node: Branches to other nodes according to the current input byte.
|
||||
// The node byte is the length of the branch (number of bytes to select from)
|
||||
// minus 1. It is followed by a sub-node:
|
||||
// - If the length is at most kMaxBranchLinearSubNodeLength, then
|
||||
// there are length-1 (key, value) pairs and then one more comparison byte.
|
||||
// If one of the key bytes matches, then the value is either a final value for
|
||||
// the string/byte sequence so far, or a "jump" delta to the next node.
|
||||
// If the last byte matches, then matching continues with the next node.
|
||||
// (Values have the same encoding as value nodes.)
|
||||
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
|
||||
// there is one byte and one "jump" delta.
|
||||
// If the input byte is less than the sub-node byte, then "jump" by delta to
|
||||
// the next sub-node which will have a length of length/2.
|
||||
// (The delta has its own compact encoding.)
|
||||
// Otherwise, skip the "jump" delta to the next sub-node
|
||||
// which will have a length of length-length/2.
|
||||
|
||||
// Node lead byte values.
|
||||
|
||||
// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
|
||||
// the length is one more than the next byte.
|
||||
|
||||
// For a branch sub-node with at most this many entries, we drop down
|
||||
// to a linear search.
|
||||
static const int32_t kMaxBranchLinearSubNodeLength=5;
|
||||
|
||||
// 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
|
||||
static const int32_t kMinLinearMatch=0x10;
|
||||
static const int32_t kMaxLinearMatchLength=0x10;
|
||||
|
||||
// 20..ff: Variable-length value node.
|
||||
// If odd, the value is final. (Otherwise, intermediate value or jump delta.)
|
||||
// Then shift-right by 1 bit.
|
||||
// The remaining lead byte value indicates the number of following bytes (0..4)
|
||||
// and contains the value's top bits.
|
||||
static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
|
||||
// It is a final value if bit 0 is set.
|
||||
static const int32_t kValueIsFinal=1;
|
||||
|
||||
// Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
|
||||
static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10
|
||||
static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
|
||||
|
||||
static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
|
||||
static const int32_t kMaxTwoByteValue=0x1aff;
|
||||
|
||||
static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
|
||||
static const int32_t kFourByteValueLead=0x7e;
|
||||
|
||||
// A little more than Unicode code points. (0x11ffff)
|
||||
static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
|
||||
|
||||
static const int32_t kFiveByteValueLead=0x7f;
|
||||
|
||||
// Compact delta integers.
|
||||
static const int32_t kMaxOneByteDelta=0xbf;
|
||||
static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
|
||||
static const int32_t kMinThreeByteDeltaLead=0xf0;
|
||||
static const int32_t kFourByteDeltaLead=0xfe;
|
||||
static const int32_t kFiveByteDeltaLead=0xff;
|
||||
|
||||
static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
|
||||
static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
|
||||
|
||||
// Fixed value referencing the ByteTrie bytes.
|
||||
const uint8_t *bytes_;
|
||||
|
||||
// Iterator variables.
|
||||
|
||||
// Pointer to next trie byte to read. NULL if no more matches.
|
||||
const uint8_t *pos_;
|
||||
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
|
||||
int32_t remainingMatchLength_;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __BYTETRIE_H__
|
|
@ -400,6 +400,7 @@
|
|||
<ClCompile Include="servslkf.cpp" />
|
||||
<ClCompile Include="usprep.cpp" />
|
||||
<ClCompile Include="bytestream.cpp" />
|
||||
<ClCompile Include="bytetrie.cpp" />
|
||||
<ClCompile Include="chariter.cpp" />
|
||||
<ClCompile Include="charstr.cpp" />
|
||||
<ClCompile Include="cstring.c" />
|
||||
|
@ -1365,6 +1366,7 @@
|
|||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<ClInclude Include="bytetrie.h" />
|
||||
<CustomBuild Include="unicode\chariter.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
|
@ -1608,4 +1610,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2009, International Business Machines
|
||||
* Copyright (c) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: October 30 2002
|
||||
* Since: ICU 2.4
|
||||
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
|
||||
**********************************************************************
|
||||
*/
|
||||
#include "propname.h"
|
||||
|
@ -16,6 +17,10 @@
|
|||
#include "cstring.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uinvchar.h"
|
||||
|
||||
#define INCLUDED_FROM_PROPNAME_CPP
|
||||
#include "propname_data.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
|
@ -94,7 +99,7 @@ uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
|
|||
if(((r1|r2)&0xff)==0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Compare the lowercased characters */
|
||||
if(r1!=r2) {
|
||||
rc=(r1&0xff)-(r2&0xff);
|
||||
|
@ -120,7 +125,7 @@ uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
|
|||
if(((r1|r2)&0xff)==0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* Compare the lowercased characters */
|
||||
if(r1!=r2) {
|
||||
rc=(r1&0xff)-(r2&0xff);
|
||||
|
@ -138,615 +143,169 @@ U_CDECL_END
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// PropertyAliases implementation
|
||||
int32_t PropNameData::findProperty(int32_t property) {
|
||||
int32_t i=1; // valueMaps index, initially after numRanges
|
||||
for(int32_t numRanges=valueMaps[0]; numRanges>0; --numRanges) {
|
||||
// Read and skip the start and limit of this range.
|
||||
int32_t start=valueMaps[i];
|
||||
int32_t limit=valueMaps[i+1];
|
||||
i+=2;
|
||||
if(property<start) {
|
||||
break;
|
||||
}
|
||||
if(property<limit) {
|
||||
return i+(property-start)*2;
|
||||
}
|
||||
i+=(limit-start)*2; // Skip all entries for this range.
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char*
|
||||
PropertyAliases::chooseNameInGroup(Offset offset,
|
||||
UPropertyNameChoice choice) const {
|
||||
int32_t c = choice;
|
||||
if (!offset || c < 0) {
|
||||
int32_t PropNameData::findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value) {
|
||||
if(valueMapIndex==0) {
|
||||
return 0; // The property does not have named values.
|
||||
}
|
||||
++valueMapIndex; // Skip the ByteTrie offset.
|
||||
int32_t numRanges=valueMaps[valueMapIndex++];
|
||||
if(numRanges<0x10) {
|
||||
// Ranges of values.
|
||||
for(; numRanges>0; --numRanges) {
|
||||
// Read and skip the start and limit of this range.
|
||||
int32_t start=valueMaps[valueMapIndex];
|
||||
int32_t limit=valueMaps[valueMapIndex+1];
|
||||
valueMapIndex+=2;
|
||||
if(value<start) {
|
||||
break;
|
||||
}
|
||||
if(value<limit) {
|
||||
return valueMaps[valueMapIndex+value-start];
|
||||
}
|
||||
valueMapIndex+=limit-start; // Skip all entries for this range.
|
||||
}
|
||||
} else {
|
||||
// List of values.
|
||||
int32_t valuesStart=valueMapIndex;
|
||||
int32_t nameGroupOffsetsStart=valueMapIndex+numRanges-0x10;
|
||||
do {
|
||||
int32_t v=valueMaps[valueMapIndex];
|
||||
if(value<v) {
|
||||
break;
|
||||
}
|
||||
if(value==v) {
|
||||
return valueMaps[nameGroupOffsetsStart+valueMapIndex-valuesStart];
|
||||
}
|
||||
} while(++valueMapIndex<nameGroupOffsetsStart);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
const char *PropNameData::getName(const char *nameGroup, int32_t nameIndex) {
|
||||
int32_t numNames=*nameGroup++;
|
||||
if(nameIndex<0 || numNames<=nameIndex) {
|
||||
return NULL;
|
||||
}
|
||||
const Offset* p = (const Offset*) getPointer(offset);
|
||||
while (c-- > 0) {
|
||||
if (*p++ < 0) return NULL;
|
||||
// Skip nameIndex names.
|
||||
for(; nameIndex>0; --nameIndex) {
|
||||
nameGroup=uprv_strchr(nameGroup, 0)+1;
|
||||
}
|
||||
Offset a = *p;
|
||||
if (a < 0) a = -a;
|
||||
return (const char*) getPointerNull(a);
|
||||
}
|
||||
|
||||
/**
 * Returns the ValueMap for the given property, or NULL when the
 * property-to-value-map table yields a zero offset for it.
 */
const ValueMap*
PropertyAliases::getValueMap(EnumValue prop) const {
    NonContiguousEnumToOffset* propToValueMap =
        (NonContiguousEnumToOffset*) getPointer(enumToValue_offset);
    Offset valueMapOffset = propToValueMap->getOffset(prop);
    if (!valueMapOffset) {
        return NULL;
    }
    return (const ValueMap*) getPointerNull(valueMapOffset);
}
|
||||
|
||||
inline const char*
|
||||
PropertyAliases::getPropertyName(EnumValue prop,
|
||||
UPropertyNameChoice choice) const {
|
||||
NonContiguousEnumToOffset* e2n = (NonContiguousEnumToOffset*) getPointer(enumToName_offset);
|
||||
return chooseNameInGroup(e2n->getOffset(prop), choice);
|
||||
}
|
||||
|
||||
/**
 * Maps a property alias to its enum value by delegating to the
 * name-to-enum table located at nameToEnum_offset.
 */
inline EnumValue
PropertyAliases::getPropertyEnum(const char* alias) const {
    NameToEnum* nameToProperty = (NameToEnum*) getPointer(nameToEnum_offset);
    EnumValue result = nameToProperty->getEnum(alias, *this);
    return result;
}
|
||||
|
||||
inline const char*
|
||||
PropertyAliases::getPropertyValueName(EnumValue prop,
|
||||
EnumValue value,
|
||||
UPropertyNameChoice choice) const {
|
||||
const ValueMap* vm = getValueMap(prop);
|
||||
if (!vm) return NULL;
|
||||
Offset a;
|
||||
if (vm->enumToName_offset) {
|
||||
a = ((EnumToOffset*) getPointer(vm->enumToName_offset))->
|
||||
getOffset(value);
|
||||
} else {
|
||||
a = ((NonContiguousEnumToOffset*) getPointer(vm->ncEnumToName_offset))->
|
||||
getOffset(value);
|
||||
if(*nameGroup==0) {
|
||||
return NULL; // no name (Property[Value]Aliases.txt has "n/a")
|
||||
}
|
||||
return chooseNameInGroup(a, choice);
|
||||
return nameGroup;
|
||||
}
|
||||
|
||||
/**
 * Maps a property value alias to its enum value.
 * Returns UCHAR_INVALID_CODE when the property has no ValueMap; otherwise
 * delegates to the ValueMap's name-to-enum table.
 */
inline EnumValue
PropertyAliases::getPropertyValueEnum(EnumValue prop,
                                      const char* alias) const {
    const ValueMap* valueMap = getValueMap(prop);
    if (valueMap != NULL) {
        NameToEnum* nameToValue =
            (NameToEnum*) getPointer(valueMap->nameToEnum_offset);
        return nameToValue->getEnum(alias, *this);
    }
    return UCHAR_INVALID_CODE;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
U_NAMESPACE_USE
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// UDataMemory structures
|
||||
|
||||
static const PropertyAliases* PNAME = NULL;
|
||||
static UDataMemory* UDATA = NULL;
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// UDataMemory loading/unloading
|
||||
|
||||
/**
|
||||
* udata callback to verify the property names (pnames) data.
|
||||
*/
|
||||
U_CDECL_BEGIN
|
||||
/*
 * Acceptance callback passed to udata_openChoice(): the data is accepted only
 * if the UDataInfo is large enough, matches this platform's endianness and
 * charset family, carries the PNAME_SIG_* data format bytes, and has
 * formatVersion[0]==PNAME_FORMAT_VERSION. The checks run in that order and
 * stop at the first mismatch.
 */
static UBool U_CALLCONV
isPNameAcceptable(void* /*context*/,
                  const char* /*type*/, const char* /*name*/,
                  const UDataInfo* info) {
    if (info->size < sizeof(UDataInfo)) {
        return FALSE;
    }
    if (info->isBigEndian != U_IS_BIG_ENDIAN) {
        return FALSE;
    }
    if (info->charsetFamily != U_CHARSET_FAMILY) {
        return FALSE;
    }
    if (info->dataFormat[0] != PNAME_SIG_0 ||
        info->dataFormat[1] != PNAME_SIG_1 ||
        info->dataFormat[2] != PNAME_SIG_2 ||
        info->dataFormat[3] != PNAME_SIG_3) {
        return FALSE;
    }
    if (info->formatVersion[0] != PNAME_FORMAT_VERSION) {
        return FALSE;
    }
    return TRUE;
}
|
||||
|
||||
static UBool U_CALLCONV pname_cleanup(void) {
|
||||
if (UDATA) {
|
||||
udata_close(UDATA);
|
||||
UDATA = NULL;
|
||||
UBool PropNameData::containsName(ByteTrie &trie, const char *name) {
|
||||
if(name==NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
PNAME = NULL;
|
||||
return TRUE;
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
/**
|
||||
* Load the property names data. Caller should check that data is
|
||||
* not loaded BEFORE calling this function. Returns TRUE if the load
|
||||
* succeeds.
|
||||
*/
|
||||
static UBool _load() {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
UDataMemory* data =
|
||||
udata_openChoice(0, PNAME_DATA_TYPE, PNAME_DATA_NAME,
|
||||
isPNameAcceptable, 0, &ec);
|
||||
if (U_SUCCESS(ec)) {
|
||||
umtx_lock(NULL);
|
||||
if (UDATA == NULL) {
|
||||
UDATA = data;
|
||||
PNAME = (const PropertyAliases*) udata_getMemory(UDATA);
|
||||
ucln_common_registerCleanup(UCLN_COMMON_PNAME, pname_cleanup);
|
||||
data = NULL;
|
||||
UDictTrieResult result=UDICTTRIE_NO_VALUE;
|
||||
char c;
|
||||
while((c=*name++)!=0) {
|
||||
c=uprv_invCharToLowercaseAscii(c);
|
||||
// Ignore delimiters '-', '_', and ASCII White_Space.
|
||||
if(c==0x2d || c==0x5f || c==0x20 || (0x09<=c && c<=0x0d)) {
|
||||
continue;
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
if(!UDICTTRIE_RESULT_HAS_NEXT(result)) {
|
||||
return FALSE;
|
||||
}
|
||||
result=trie.next((uint8_t)c);
|
||||
}
|
||||
if (data) {
|
||||
udata_close(data);
|
||||
}
|
||||
return PNAME!=NULL;
|
||||
return UDICTTRIE_RESULT_HAS_VALUE(result);
|
||||
}
|
||||
|
||||
/**
|
||||
* Inline function that expands to code that does a lazy load of the
|
||||
* property names data. If the data is already loaded, avoids an
|
||||
* unnecessary function call. If the data is not loaded, call _load()
|
||||
* to load it, and return TRUE if the load succeeds.
|
||||
*/
|
||||
static inline UBool load() {
|
||||
UBool f;
|
||||
UMTX_CHECK(NULL, (PNAME!=NULL), f);
|
||||
return f || _load();
|
||||
/**
 * Returns the name of a property for the given name choice,
 * or NULL if findProperty() does not know the property.
 */
const char *PropNameData::getPropertyName(int32_t property, int32_t nameChoice) {
    const int32_t propertyIndex=findProperty(property);
    if(propertyIndex!=0) {
        // valueMaps[propertyIndex] is the offset of the property's name group.
        const char *group=nameGroups+valueMaps[propertyIndex];
        return getName(group, nameChoice);
    }
    return NULL;  // Not a known property.
}
|
||||
|
||||
/**
 * Returns the name of a property value for the given name choice.
 * Returns NULL if the property is unknown or if no name group is found
 * for the value.
 */
const char *PropNameData::getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice) {
    const int32_t propertyIndex=findProperty(property);
    if(propertyIndex!=0) {
        // valueMaps[propertyIndex+1] is the index of the property's value map.
        const int32_t groupOffset=
            findPropertyValueNameGroup(valueMaps[propertyIndex+1], value);
        if(groupOffset!=0) {
            return getName(nameGroups+groupOffset, nameChoice);
        }
    }
    return NULL;
}
|
||||
|
||||
/**
 * Looks up an alias in the ByteTrie that starts at byteTries+byteTrieOffset.
 * Returns the trie's value for the alias, or UCHAR_INVALID_CODE when
 * containsName() reports that the alias is not in the trie.
 */
int32_t PropNameData::getPropertyOrValueEnum(int32_t byteTrieOffset, const char *alias) {
    ByteTrie trie(byteTries+byteTrieOffset);
    if(!containsName(trie, alias)) {
        return UCHAR_INVALID_CODE;
    }
    return trie.getValue();
}
|
||||
|
||||
/**
 * Maps a property alias to its enum value using the trie at the very start
 * of byteTries[] (offset 0).
 */
int32_t PropNameData::getPropertyEnum(const char *alias) {
    const int32_t propertyNamesTrieOffset=0;
    return getPropertyOrValueEnum(propertyNamesTrieOffset, alias);
}
|
||||
|
||||
/**
 * Maps a property value alias to its enum value.
 * Returns UCHAR_INVALID_CODE if the property is unknown or has no value map.
 */
int32_t PropNameData::getPropertyValueEnum(int32_t property, const char *alias) {
    const int32_t propertyIndex=findProperty(property);
    if(propertyIndex!=0) {
        const int32_t valueMapStart=valueMaps[propertyIndex+1];
        if(valueMapStart!=0) {
            // The first word of a property's value map is its ByteTrie offset.
            return getPropertyOrValueEnum(valueMaps[valueMapStart], alias);
        }
        // valueMapStart==0: the property does not have named values.
    }
    return UCHAR_INVALID_CODE;
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Public API implementation
|
||||
|
||||
// The C API is just a thin wrapper. Each function obtains a pointer
|
||||
// to the singleton PropertyAliases, and calls the appropriate method
|
||||
// on it. If it cannot obtain a pointer, because valid data is not
|
||||
// available, then it returns NULL or UCHAR_INVALID_CODE.
|
||||
|
||||
// Public C API: returns a name of `property` for the requested `nameChoice`.
// NOTE(review): this body holds two consecutive return statements, so the second
// one (PropNameData::getPropertyName) is unreachable as written. It looks like the
// before/after bodies of a diff were merged here -- confirm which implementation
// is intended and remove the other.
U_CAPI const char* U_EXPORT2
|
||||
u_getPropertyName(UProperty property,
|
||||
UPropertyNameChoice nameChoice) {
|
||||
return load() ? PNAME->getPropertyName(property, nameChoice)
|
||||
: NULL;
|
||||
return PropNameData::getPropertyName(property, nameChoice);
|
||||
}
|
||||
|
||||
// Public C API: maps a property alias to its UProperty enum value.
// NOTE(review): `return p;` is followed by a second return statement
// (PropNameData::getPropertyEnum) that is unreachable as written. It looks like the
// before/after bodies of a diff were merged here -- confirm which implementation
// is intended and remove the other.
U_CAPI UProperty U_EXPORT2
|
||||
u_getPropertyEnum(const char* alias) {
|
||||
UProperty p = load() ? (UProperty) PNAME->getPropertyEnum(alias)
|
||||
: UCHAR_INVALID_CODE;
|
||||
return p;
|
||||
return (UProperty)PropNameData::getPropertyEnum(alias);
|
||||
}
|
||||
|
||||
// Public C API: returns a name of `value` of `property` for the requested `nameChoice`.
// NOTE(review): this body holds two consecutive return statements, so the second
// one (PropNameData::getPropertyValueName) is unreachable as written. It looks like
// the before/after bodies of a diff were merged here -- confirm which
// implementation is intended and remove the other.
U_CAPI const char* U_EXPORT2
|
||||
u_getPropertyValueName(UProperty property,
|
||||
int32_t value,
|
||||
UPropertyNameChoice nameChoice) {
|
||||
return load() ? PNAME->getPropertyValueName(property, value, nameChoice)
|
||||
: NULL;
|
||||
return PropNameData::getPropertyValueName(property, value, nameChoice);
|
||||
}
|
||||
|
||||
// Public C API: maps an alias of a value of `property` to the value's enum.
// NOTE(review): this body holds two consecutive return statements, so the second
// one (PropNameData::getPropertyValueEnum) is unreachable as written. It looks like
// the before/after bodies of a diff were merged here -- confirm which
// implementation is intended and remove the other.
U_CAPI int32_t U_EXPORT2
|
||||
u_getPropertyValueEnum(UProperty property,
|
||||
const char* alias) {
|
||||
return load() ? PNAME->getPropertyValueEnum(property, alias)
|
||||
: (int32_t)UCHAR_INVALID_CODE;
|
||||
return PropNameData::getPropertyValueEnum(property, alias);
|
||||
}
|
||||
|
||||
/* data swapping ------------------------------------------------------------ */
|
||||
|
||||
/*
|
||||
* Sub-structure-swappers use the temp array (which is as large as the
|
||||
* actual data) for intermediate storage,
|
||||
* as well as to indicate if a particular structure has been swapped already.
|
||||
* The temp array is initially reset to all 0.
|
||||
* pos is the byte offset of the sub-structure in the inBytes/outBytes/temp arrays.
|
||||
*/
|
||||
|
||||
/*
 * Swaps one EnumToOffset map located at byte offset pos.
 *
 * temp mirrors the data and is initially all 0; the map's enumStart/enumLimit
 * are stored there in platform byte order, which also marks the map as
 * "already swapped" for later calls on the same pos.
 *
 * length<0 means preflighting: only the size is computed, nothing is written
 * to outBytes.
 *
 * Returns the size of the map in bytes, or 0 with *pErrorCode set to
 * U_INDEX_OUTOFBOUNDS_ERROR if the map does not fit into length bytes.
 */
int32_t
EnumToOffset::swap(const UDataSwapper *ds,
                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                   uint8_t *temp, int32_t pos,
                   UErrorCode *pErrorCode) {
    const EnumToOffset *inMap;
    EnumToOffset *outMap, *tempMap;
    int32_t size;

    tempMap=(EnumToOffset *)(temp+pos);
    if(tempMap->enumStart!=0 || tempMap->enumLimit!=0) {
        /* this map was swapped already */
        size=tempMap->getSize();
        return size;
    }

    inMap=(const EnumToOffset *)(inBytes+pos);
    outMap=(EnumToOffset *)(outBytes+pos);

    tempMap->enumStart=udata_readInt32(ds, inMap->enumStart);
    tempMap->enumLimit=udata_readInt32(ds, inMap->enumLimit);
    size=tempMap->getSize();

    if(length>=0) {
        /*
         * The map must lie completely inside the data.
         * (This used to be additionally guarded by
         * length<sizeof(PropertyAliases), which upname_swap() has already
         * excluded, so the error could never fire and a truncated map was
         * swapped beyond the end of the buffers.)
         */
        if(length<(pos+size)) {
            udata_printError(ds, "upname_swap(EnumToOffset): too few bytes (%d after header)\n"
                                 " for pnames.icu EnumToOffset{%d..%d} at %d\n",
                             length, tempMap->enumStart, tempMap->enumLimit, pos);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* swap enumStart and enumLimit */
        ds->swapArray32(ds, inMap, 2*sizeof(EnumValue), outMap, pErrorCode);

        /* swap _offsetArray[] */
        ds->swapArray16(ds, inMap->getOffsetArray(), (tempMap->enumLimit-tempMap->enumStart)*sizeof(Offset),
                        outMap->getOffsetArray(), pErrorCode);
    }

    return size;
}
|
||||
|
||||
/*
 * Swaps one NonContiguousEnumToOffset map located at byte offset pos.
 *
 * temp mirrors the data and is initially all 0; the map's count is stored
 * there in platform byte order, which also marks the map as "already swapped"
 * for later calls on the same pos.
 *
 * length<0 means preflighting: only the size is computed, nothing is written
 * to outBytes.
 *
 * Returns the size of the map in bytes, or 0 with *pErrorCode set to
 * U_INDEX_OUTOFBOUNDS_ERROR if the map does not fit into length bytes.
 */
int32_t
NonContiguousEnumToOffset::swap(const UDataSwapper *ds,
                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                   uint8_t *temp, int32_t pos,
                   UErrorCode *pErrorCode) {
    const NonContiguousEnumToOffset *inMap;
    NonContiguousEnumToOffset *outMap, *tempMap;
    int32_t size;

    tempMap=(NonContiguousEnumToOffset *)(temp+pos);
    if(tempMap->count!=0) {
        /* this map was swapped already */
        size=tempMap->getSize();
        return size;
    }

    inMap=(const NonContiguousEnumToOffset *)(inBytes+pos);
    outMap=(NonContiguousEnumToOffset *)(outBytes+pos);

    tempMap->count=udata_readInt32(ds, inMap->count);
    size=tempMap->getSize();

    if(length>=0) {
        /*
         * The map must lie completely inside the data.
         * (This used to be additionally guarded by
         * length<sizeof(PropertyAliases), which upname_swap() has already
         * excluded, so the error could never fire and a truncated map was
         * swapped beyond the end of the buffers.)
         */
        if(length<(pos+size)) {
            udata_printError(ds, "upname_swap(NonContiguousEnumToOffset): too few bytes (%d after header)\n"
                                 " for pnames.icu NonContiguousEnumToOffset[%d] at %d\n",
                             length, tempMap->count, pos);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* swap count and _enumArray[] */
        length=(1+tempMap->count)*sizeof(EnumValue);
        ds->swapArray32(ds, inMap, length,
                        outMap, pErrorCode);

        /* swap _offsetArray[], which directly follows _enumArray[] */
        pos+=length;
        ds->swapArray16(ds, inBytes+pos, tempMap->count*sizeof(Offset),
                        outBytes+pos, pErrorCode);
    }

    return size;
}
|
||||
|
||||
struct NameAndIndex {
|
||||
Offset name, index;
|
||||
};
|
||||
|
||||
U_CDECL_BEGIN
|
||||
typedef int32_t U_CALLCONV PropNameCompareFn(const char *name1, const char *name2);
|
||||
|
||||
struct CompareContext {
|
||||
const char *chars;
|
||||
PropNameCompareFn *propCompare;
|
||||
};
|
||||
|
||||
/*
 * Sort comparator for NameAndIndex rows: resolves each row's name offset
 * against the context's chars base and compares the two names with the
 * context's propCompare function.
 */
static int32_t U_CALLCONV
upname_compareRows(const void *context, const void *left, const void *right) {
    const CompareContext *ctx=(const CompareContext *)context;
    const NameAndIndex *leftRow=(const NameAndIndex *)left;
    const NameAndIndex *rightRow=(const NameAndIndex *)right;
    const char *leftName=ctx->chars+leftRow->name;
    const char *rightName=ctx->chars+rightRow->name;
    return ctx->propCompare(leftName, rightName);
}
|
||||
U_CDECL_END
|
||||
|
||||
/*
 * Swaps one NameToEnum map located at byte offset pos.
 *
 * temp mirrors the data and is initially all 0; the map's count is stored
 * there in platform byte order, which also marks the map as "already swapped"
 * for later calls on the same pos. The rest of the map's area in temp is used
 * as scratch space for re-sorting.
 *
 * length<0 means preflighting: only the size is computed, nothing is written
 * to outBytes.
 *
 * Returns the size of the map in bytes, or 0 with *pErrorCode set if the map
 * does not fit into length bytes or if sorting fails.
 */
int32_t
NameToEnum::swap(const UDataSwapper *ds,
                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                   uint8_t *temp, int32_t pos,
                   UErrorCode *pErrorCode) {
    const NameToEnum *inMap;
    NameToEnum *outMap, *tempMap;

    const EnumValue *inEnumArray;
    EnumValue *outEnumArray;

    const Offset *inNameArray;
    Offset *outNameArray;

    NameAndIndex *sortArray;
    CompareContext cmp;

    int32_t i, size, oldIndex;

    tempMap=(NameToEnum *)(temp+pos);
    if(tempMap->count!=0) {
        /* this map was swapped already */
        size=tempMap->getSize();
        return size;
    }

    inMap=(const NameToEnum *)(inBytes+pos);
    outMap=(NameToEnum *)(outBytes+pos);

    tempMap->count=udata_readInt32(ds, inMap->count);
    size=tempMap->getSize();

    if(length>=0) {
        /*
         * The map must lie completely inside the data.
         * (This used to be additionally guarded by
         * length<sizeof(PropertyAliases), which upname_swap() has already
         * excluded, so the error could never fire and a truncated map was
         * swapped beyond the end of the buffers.)
         */
        if(length<(pos+size)) {
            udata_printError(ds, "upname_swap(NameToEnum): too few bytes (%d after header)\n"
                                 " for pnames.icu NameToEnum[%d] at %d\n",
                             length, tempMap->count, pos);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* swap count */
        ds->swapArray32(ds, inMap, 4, outMap, pErrorCode);

        inEnumArray=inMap->getEnumArray();
        outEnumArray=outMap->getEnumArray();

        /* the name array directly follows the enum array */
        inNameArray=(const Offset *)(inEnumArray+tempMap->count);
        outNameArray=(Offset *)(outEnumArray+tempMap->count);

        if(ds->inCharset==ds->outCharset) {
            /* no need to sort, just swap the enum/name arrays */
            ds->swapArray32(ds, inEnumArray, tempMap->count*4, outEnumArray, pErrorCode);
            ds->swapArray16(ds, inNameArray, tempMap->count*2, outNameArray, pErrorCode);
            return size;
        }

        /*
         * The name and enum arrays are sorted by names and must be resorted
         * if inCharset!=outCharset.
         * We use the corresponding part of the temp array to sort an array
         * of pairs of name offsets and sorting indexes.
         * Then the sorting indexes are used to permutate-swap the name and enum arrays.
         *
         * The outBytes must already contain the swapped strings.
         */
        sortArray=(NameAndIndex *)tempMap->getEnumArray();
        for(i=0; i<tempMap->count; ++i) {
            sortArray[i].name=udata_readInt16(ds, inNameArray[i]);
            sortArray[i].index=(Offset)i;
        }

        /*
         * use a stable sort to avoid shuffling of equal strings,
         * which makes testing harder
         */
        cmp.chars=(const char *)outBytes;
        if (ds->outCharset==U_ASCII_FAMILY) {
            cmp.propCompare=uprv_compareASCIIPropertyNames;
        }
        else {
            cmp.propCompare=uprv_compareEBCDICPropertyNames;
        }
        uprv_sortArray(sortArray, tempMap->count, sizeof(NameAndIndex),
                       upname_compareRows, &cmp,
                       TRUE, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "upname_swap(NameToEnum).uprv_sortArray(%d items) failed\n",
                             tempMap->count);
            return 0;
        }

        /* copy/swap/permutate _enumArray[] and _nameArray[] */
        if(inEnumArray!=outEnumArray) {
            for(i=0; i<tempMap->count; ++i) {
                oldIndex=sortArray[i].index;
                ds->swapArray32(ds, inEnumArray+oldIndex, 4, outEnumArray+i, pErrorCode);
                ds->swapArray16(ds, inNameArray+oldIndex, 2, outNameArray+i, pErrorCode);
            }
        } else {
            /*
             * in-place swapping: need to permutate into a temporary array
             * and then copy back to not destroy the data
             */
            EnumValue *tempEnumArray;
            Offset *oldIndexes;

            /* write name offsets directly from sortArray */
            for(i=0; i<tempMap->count; ++i) {
                ds->writeUInt16((uint16_t *)outNameArray+i, (uint16_t)sortArray[i].name);
            }

            /*
             * compress the oldIndexes into a separate array to make space for tempEnumArray
             * the tempMap _nameArray becomes oldIndexes[], getting the index
             *   values from the 2D sortArray[],
             * while sortArray=tempMap _enumArray[] becomes tempEnumArray[]
             * this saves us allocating more memory
             *
             * it works because sizeof(NameAndIndex)<=sizeof(EnumValue)
             * and because the nameArray[] can be used for oldIndexes[]
             */
            tempEnumArray=(EnumValue *)sortArray;
            oldIndexes=(Offset *)(sortArray+tempMap->count);

            /* copy sortArray[].index values into oldIndexes[] */
            for(i=0; i<tempMap->count; ++i) {
                oldIndexes[i]=sortArray[i].index;
            }

            /* permutate inEnumArray[] into tempEnumArray[] */
            for(i=0; i<tempMap->count; ++i) {
                ds->swapArray32(ds, inEnumArray+oldIndexes[i], 4, tempEnumArray+i, pErrorCode);
            }

            /* copy tempEnumArray[] to outEnumArray[] */
            uprv_memcpy(outEnumArray, tempEnumArray, tempMap->count*4);
        }
    }

    return size;
}
|
||||
|
||||
/*
 * Swaps the pnames data that follows the udata header.
 *
 * The PropertyAliases header fields are first read into a local copy in
 * platform byte order; all offsets and sizes used below come from that copy.
 *
 * length<0 means preflighting: only the total size is returned and nothing is
 * written to outBytes.
 *
 * Returns total_size from the header, or 0 with *pErrorCode set on failure.
 * A failure reported by any step stops the swap immediately, so that later
 * steps do not keep reading and writing the buffers after an error.
 */
int32_t
PropertyAliases::swap(const UDataSwapper *ds,
                      const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                      UErrorCode *pErrorCode) {
    const PropertyAliases *inAliases;
    PropertyAliases *outAliases;
    PropertyAliases aliases;

    const ValueMap *inValueMaps;
    ValueMap *outValueMaps;
    ValueMap valueMap;

    int32_t i;

    inAliases=(const PropertyAliases *)inBytes;
    outAliases=(PropertyAliases *)outBytes;

    /* read the input PropertyAliases - all 16-bit values */
    for(i=0; i<(int32_t)sizeof(PropertyAliases)/2; ++i) {
        ((uint16_t *)&aliases)[i]=ds->readUInt16(((const uint16_t *)inBytes)[i]);
    }

    if(length>=0) {
        if(length<aliases.total_size) {
            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for all of pnames.icu\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* copy the data for inaccessible bytes */
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes, inBytes, aliases.total_size);
        }

        /* swap the PropertyAliases class fields */
        ds->swapArray16(ds, inAliases, sizeof(PropertyAliases), outAliases, pErrorCode);

        /* swap the name groups */
        ds->swapArray16(ds, inBytes+aliases.nameGroupPool_offset,
                                aliases.stringPool_offset-aliases.nameGroupPool_offset,
                        outBytes+aliases.nameGroupPool_offset, pErrorCode);

        /* swap the strings */
        udata_swapInvStringBlock(ds, inBytes+aliases.stringPool_offset,
                                        aliases.total_size-aliases.stringPool_offset,
                                    outBytes+aliases.stringPool_offset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /*
         * alloc uint8_t temp[total_size] and reset it
         * swap each top-level struct, put at least the count fields into temp
         *   use subclass-specific swap() functions
         * enumerate value maps, for each
         *   if temp does not have count!=0 yet
         *     read count, put it into temp
         *     swap the array(s)
         *     resort strings in name->enum maps
         * swap value maps
         */
        LocalMemory<uint8_t> temp;
        if(temp.allocateInsteadAndReset(aliases.total_size)==NULL) {
            udata_printError(ds, "upname_swap(): unable to allocate temp memory (%d bytes)\n",
                             aliases.total_size);
            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }

        /* swap properties->name groups map */
        NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
                                        temp.getAlias(), aliases.enumToName_offset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /* swap name->properties map */
        NameToEnum::swap(ds, inBytes, length, outBytes,
                         temp.getAlias(), aliases.nameToEnum_offset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /* swap properties->value maps map */
        NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
                                        temp.getAlias(), aliases.enumToValue_offset, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        /* enumerate all ValueMaps and swap them */
        inValueMaps=(const ValueMap *)(inBytes+aliases.valueMap_offset);
        outValueMaps=(ValueMap *)(outBytes+aliases.valueMap_offset);

        for(i=0; i<aliases.valueMap_count; ++i) {
            valueMap.enumToName_offset=udata_readInt16(ds, inValueMaps[i].enumToName_offset);
            valueMap.ncEnumToName_offset=udata_readInt16(ds, inValueMaps[i].ncEnumToName_offset);
            valueMap.nameToEnum_offset=udata_readInt16(ds, inValueMaps[i].nameToEnum_offset);

            if(valueMap.enumToName_offset!=0) {
                EnumToOffset::swap(ds, inBytes, length, outBytes,
                                   temp.getAlias(), valueMap.enumToName_offset,
                                   pErrorCode);
            } else if(valueMap.ncEnumToName_offset!=0) {
                NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
                                                temp.getAlias(), valueMap.ncEnumToName_offset,
                                                pErrorCode);
            }
            if(valueMap.nameToEnum_offset!=0) {
                NameToEnum::swap(ds, inBytes, length, outBytes,
                                 temp.getAlias(), valueMap.nameToEnum_offset,
                                 pErrorCode);
            }
            if(U_FAILURE(*pErrorCode)) {
                return 0;
            }
        }

        /* swap the ValueMaps array itself */
        ds->swapArray16(ds, inValueMaps, aliases.valueMap_count*sizeof(ValueMap),
                        outValueMaps, pErrorCode);

        /* name groups and strings were swapped above */
    }

    return aliases.total_size;
}
|
||||
|
||||
/*
 * Swaps a complete pnames.icu image: the udata header first, then the
 * PropertyAliases payload. Only dataFormat "pnam" with formatVersion 1 is
 * accepted. Returns the header size plus the payload size reported by
 * PropertyAliases::swap(), or 0 with *pErrorCode set on failure.
 */
U_CAPI int32_t U_EXPORT2
upname_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
    /* udata_swapDataHeader checks the arguments */
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    const UDataInfo *pInfo=(const UDataInfo *)((const char *)inData+4);
    if( pInfo->dataFormat[0]!=0x70 ||   /* dataFormat="pnam" */
        pInfo->dataFormat[1]!=0x6e ||
        pInfo->dataFormat[2]!=0x61 ||
        pInfo->dataFormat[3]!=0x6d ||
        pInfo->formatVersion[0]!=1
    ) {
        udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    /* a negative length (preflighting) is passed through unchanged */
    if(length>=0) {
        length-=headerSize;
        if(length<(int32_t)sizeof(PropertyAliases)) {
            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    /* the payload starts right after the header in both buffers */
    const uint8_t *inBytes=(const uint8_t *)inData+headerSize;
    uint8_t *outBytes=(uint8_t *)outData+headerSize;

    int32_t payloadSize=PropertyAliases::swap(ds, inBytes, length, outBytes, pErrorCode);
    return headerSize+payloadSize;
}
|
||||
|
||||
//eof
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2004, International Business Machines
|
||||
* Copyright (c) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
* Created: October 30 2002
|
||||
* Since: ICU 2.4
|
||||
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
|
||||
**********************************************************************
|
||||
*/
|
||||
#ifndef PROPNAME_H
|
||||
|
@ -13,6 +14,7 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "bytetrie.h"
|
||||
#include "udataswp.h"
|
||||
#include "uprops.h"
|
||||
|
||||
|
@ -75,441 +77,134 @@ U_CDECL_END
|
|||
#define PNAME_SIG_2 ((uint8_t)0x61) /* a */
|
||||
#define PNAME_SIG_3 ((uint8_t)0x6D) /* m */
|
||||
|
||||
#define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */
|
||||
|
||||
/**
|
||||
* Swap pnames.icu. See udataswp.h.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
upname_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
class Builder;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* An offset from the start of the pnames data to a contained entity.
|
||||
* This must be a signed value, since negative offsets are used as an
|
||||
* end-of-list marker. Offsets to actual objects are non-zero. A
|
||||
* zero offset indicates an absent entry; this corresponds to aliases
|
||||
* marked "n/a" in the original Unicode data files.
|
||||
*/
|
||||
typedef int16_t Offset; /* must be signed */
|
||||
class PropNameData {
|
||||
public:
|
||||
enum {
|
||||
// Byte offsets from the start of the data, after the generic header.
|
||||
IX_VALUE_MAPS_OFFSET,
|
||||
IX_BYTE_TRIES_OFFSET,
|
||||
IX_NAME_GROUPS_OFFSET,
|
||||
IX_RESERVED3_OFFSET,
|
||||
IX_RESERVED4_OFFSET,
|
||||
IX_TOTAL_SIZE,
|
||||
|
||||
#define MAX_OFFSET 0x7FFF
|
||||
// Other values.
|
||||
IX_MAX_NAME_LENGTH,
|
||||
IX_RESERVED7,
|
||||
IX_COUNT
|
||||
};
|
||||
|
||||
/**
|
||||
* A generic value for a property or property value. Typically an
|
||||
* enum from uchar.h, but sometimes a non-enum value. It must be
|
||||
* large enough to accommodate the largest enum value, which as of this
|
||||
* writing is the largest general category mask. Need not be signed
|
||||
* but may be. Typically it doesn't matter, since the caller will
|
||||
* cast it to the proper type before use. Takes the special value
|
||||
* UCHAR_INVALID_CODE for invalid input.
|
||||
*/
|
||||
typedef int32_t EnumValue;
|
||||
static const char *getPropertyName(int32_t property, int32_t nameChoice);
|
||||
static const char *getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice);
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
/* ValueMap */
|
||||
static int32_t getPropertyEnum(const char *alias);
|
||||
static int32_t getPropertyValueEnum(int32_t property, const char *alias);
|
||||
|
||||
/**
|
||||
* For any top-level property that has named values (binary and
|
||||
* enumerated properties), there is a ValueMap object. This object
|
||||
* maps from enum values to two other maps. One goes from value enums
|
||||
* to value names. The other goes from value names to value enums.
|
||||
*
|
||||
* The value enum values may be contiguous or disjoint. If they are
|
||||
* contiguous then the enumToName_offset is nonzero, and the
|
||||
* ncEnumToName_offset is zero. Vice versa if the value enums are
|
||||
* disjoint.
|
||||
*
|
||||
* There are n of these objects, where n is the number of binary
|
||||
* properties + the number of enumerated properties.
|
||||
*/
|
||||
struct ValueMap {
|
||||
private:
|
||||
static int32_t findProperty(int32_t property);
|
||||
static int32_t findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value);
|
||||
static const char *getName(const char *nameGroup, int32_t nameIndex);
|
||||
static UBool containsName(ByteTrie &trie, const char *name);
|
||||
|
||||
/* -- begin pnames data -- */
|
||||
/* Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */
|
||||
/* Exactly one of these will be nonzero. */
|
||||
Offset enumToName_offset;
|
||||
Offset ncEnumToName_offset;
|
||||
static int32_t getPropertyOrValueEnum(int32_t byteTrieOffset, const char *alias);
|
||||
|
||||
Offset nameToEnum_offset; /* Name=>enum data */
|
||||
/* -- end pnames data -- */
|
||||
static const int32_t indexes[];
|
||||
static const int32_t valueMaps[];
|
||||
static const uint8_t byteTries[];
|
||||
static const char nameGroups[];
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
/* PropertyAliases class */
|
||||
|
||||
/**
|
||||
* A class encapsulating access to the memory-mapped data representing
|
||||
* property aliases and property value aliases (pnames). The class
|
||||
* MUST have no v-table and declares certain methods inline -- small
|
||||
* methods and methods that are called from only one point.
|
||||
/*
|
||||
* pnames.icu formatVersion 2
|
||||
*
|
||||
* The data members in this class correspond to the in-memory layout
|
||||
* of the header of the pnames data.
|
||||
* formatVersion 2 is new in ICU 4.8.
|
||||
* In ICU 4.8, the pnames.icu data file is used only in ICU4J.
|
||||
* ICU4C 4.8 has the same data structures hardcoded in source/common/propname_data.h.
|
||||
*
|
||||
* For documentation of pnames.icu formatVersion 1 see ICU4C 4.6 (2010-dec-01)
|
||||
* or earlier versions of this header file (source/common/propname.h).
|
||||
*
|
||||
* The pnames.icu begins with the standard ICU DataHeader/UDataInfo.
|
||||
* After that:
|
||||
*
|
||||
* int32_t indexes[8];
|
||||
*
|
||||
* (See the PropNameData::IX_... constants.)
|
||||
*
|
||||
* The first 6 indexes are byte offsets from the beginning of the data
|
||||
* (beginning of indexes[]) to following structures.
|
||||
* The length of each structure is the difference between its offset
|
||||
* and the next one.
|
||||
* All offsets are filled in: Where there is no data between two offsets,
|
||||
* those two offsets are the same.
|
||||
* The last offset (indexes[PropNameData::IX_TOTAL_SIZE]) indicates the
|
||||
* total number of bytes in the file. (Not counting the standard headers.)
|
||||
*
|
||||
* The seventh index (indexes[PropNameData::IX_MAX_NAME_LENGTH]) has the
|
||||
* maximum length of any Unicode property (or property value) alias.
|
||||
* (Without normalization, that is, including underscores etc.)
|
||||
*
|
||||
* int32_t valueMaps[];
|
||||
*
|
||||
* The valueMaps[] begins with a map from UProperty enums to properties,
|
||||
* followed by the per-property value maps from property values to names,
|
||||
* for those properties that have named values.
|
||||
* (Binary & enumerated, plus General_Category_Mask.)
|
||||
*
|
||||
* valueMaps[0] contains the number of UProperty enum ranges.
|
||||
* For each range:
|
||||
* int32_t start, limit -- first and last+1 UProperty enum of a dense range
|
||||
* Followed by (limit-start) pairs of
|
||||
* int32_t nameGroupOffset;
|
||||
* Offset into nameGroups[] for the property's names/aliases.
|
||||
* int32_t valueMapIndex;
|
||||
* Offset of the property's value map in the valueMaps[] array.
|
||||
* If the valueMapIndex is 0, then the property does not have named values.
|
||||
*
|
||||
* For each property's value map:
|
||||
* int32_t byteTrieOffset; -- Offset into byteTries[] for name->value mapping.
|
||||
* int32_t numRanges;
|
||||
* If numRanges is in the range 1..15, then that many ranges of values follow.
|
||||
* Per range:
|
||||
* int32_t start, limit -- first and last+1 property value of a range
|
||||
* Followed by (limit-start) entries of
|
||||
* int32_t nameGroupOffset;
|
||||
* Offset into nameGroups[] for the property value's names/aliases.
|
||||
* If the nameGroupOffset is 0, then this is not a named value for this property.
|
||||
* (That is, the ranges need not be dense.)
|
||||
* If numRanges is >=0x10, then (numRanges-0x10) sorted values
|
||||
* and then (numRanges-0x10) corresponding nameGroupOffsets follow.
|
||||
* Values are sorted as signed integers.
|
||||
* In this case, the set of values is dense; no nameGroupOffset will be 0.
|
||||
*
|
||||
* For both properties and property values, ranges are sorted by their start/limit values.
|
||||
*
|
||||
* uint8_t byteTries[];
|
||||
*
|
||||
* This is a sequence of ByteTrie structures, byte-serialized tries for
|
||||
* mapping from names/aliases to values.
|
||||
* The first one maps from property names/aliases to UProperty enum constants.
|
||||
* The following ones are indexed by property value map byteTrieOffsets
|
||||
* for mapping each property's names/aliases to their property values.
|
||||
*
|
||||
* char nameGroups[];
|
||||
*
|
||||
* This is a sequence of property name groups.
|
||||
* Each group is a list of names/aliases (invariant-character strings) for
|
||||
* one property or property value, in the order of UPropertyNameChoice.
|
||||
* The first byte of each group is the number of names in the group.
|
||||
* It is followed by that many NUL-terminated strings.
|
||||
* The first string is for the short name; if there is no short name,
|
||||
* then the first string is empty.
|
||||
* The second string is the long name. Further strings are additional aliases.
|
||||
*
|
||||
* The first name group is for a property rather than a property value,
|
||||
* so that a nameGroupOffset of 0 can be used to indicate "no value"
|
||||
* in a property's sparse value ranges.
|
||||
*/
|
||||
class PropertyAliases {
|
||||
|
||||
/* -- begin pnames data -- */
|
||||
/* Enum=>name EnumToOffset object for binary and enumerated */
|
||||
/* properties */
|
||||
Offset enumToName_offset;
|
||||
|
||||
/* Name=>enum data for binary & enumerated properties */
|
||||
Offset nameToEnum_offset;
|
||||
|
||||
/* Enum=>offset EnumToOffset object mapping enumerated properties */
|
||||
/* to ValueMap objects */
|
||||
Offset enumToValue_offset;
|
||||
|
||||
/* The following are needed by external readers of this data. */
|
||||
/* We don't use them ourselves. */
|
||||
int16_t total_size; /* size in bytes excluding the udata header */
|
||||
Offset valueMap_offset; /* offset to start of array */
|
||||
int16_t valueMap_count; /* number of entries */
|
||||
Offset nameGroupPool_offset; /* offset to start of array */
|
||||
int16_t nameGroupPool_count; /* number of entries (not groups) */
|
||||
Offset stringPool_offset; /* offset to start of pool */
|
||||
int16_t stringPool_count; /* number of strings (not size in bytes) */
|
||||
|
||||
/* -- end pnames data -- */
|
||||
|
||||
friend class ::Builder;
|
||||
|
||||
const ValueMap* getValueMap(EnumValue prop) const;
|
||||
|
||||
const char* chooseNameInGroup(Offset offset,
|
||||
UPropertyNameChoice choice) const;
|
||||
|
||||
public:
|
||||
|
||||
inline const int8_t* getPointer(Offset o) const {
|
||||
return ((const int8_t*) this) + o;
|
||||
}
|
||||
|
||||
inline const int8_t* getPointerNull(Offset o) const {
|
||||
return o ? getPointer(o) : NULL;
|
||||
}
|
||||
|
||||
inline const char* getPropertyName(EnumValue prop,
|
||||
UPropertyNameChoice choice) const;
|
||||
|
||||
inline EnumValue getPropertyEnum(const char* alias) const;
|
||||
|
||||
inline const char* getPropertyValueName(EnumValue prop, EnumValue value,
|
||||
UPropertyNameChoice choice) const;
|
||||
|
||||
inline EnumValue getPropertyValueEnum(EnumValue prop,
|
||||
const char* alias) const;
|
||||
|
||||
static int32_t
|
||||
swap(const UDataSwapper *ds,
|
||||
const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
|
||||
UErrorCode *pErrorCode);
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
/* EnumToOffset */
|
||||
|
||||
/**
|
||||
* A generic map from enum values to Offsets. The enum values must be
|
||||
* contiguous, from enumStart to enumLimit. The Offset values may
|
||||
* point to anything.
|
||||
*/
|
||||
class EnumToOffset {
|
||||
|
||||
/* -- begin pnames data -- */
|
||||
EnumValue enumStart;
|
||||
EnumValue enumLimit;
|
||||
Offset _offsetArray; /* [array of enumLimit-enumStart] */
|
||||
/* -- end pnames data -- */
|
||||
|
||||
friend class ::Builder;
|
||||
|
||||
Offset* getOffsetArray() {
|
||||
return &_offsetArray;
|
||||
}
|
||||
|
||||
const Offset* getOffsetArray() const {
|
||||
return &_offsetArray;
|
||||
}
|
||||
|
||||
static int32_t getSize(int32_t n) {
|
||||
return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1);
|
||||
}
|
||||
|
||||
int32_t getSize() {
|
||||
return getSize(enumLimit - enumStart);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
Offset getOffset(EnumValue enumProbe) const {
|
||||
if (enumProbe < enumStart ||
|
||||
enumProbe >= enumLimit) {
|
||||
return 0; /* not found */
|
||||
}
|
||||
const Offset* p = getOffsetArray();
|
||||
return p[enumProbe - enumStart];
|
||||
}
|
||||
|
||||
static int32_t
|
||||
swap(const UDataSwapper *ds,
|
||||
const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
|
||||
uint8_t *temp, int32_t pos,
|
||||
UErrorCode *pErrorCode);
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
/* NonContiguousEnumToOffset */
|
||||
|
||||
/**
|
||||
* A generic map from enum values to Offsets. The enum values may be
|
||||
* disjoint. If they are contiguous, an EnumToOffset should be used
|
||||
* instead. The Offset values may point to anything.
|
||||
*/
|
||||
class NonContiguousEnumToOffset {
|
||||
|
||||
/* -- begin pnames data -- */
|
||||
int32_t count;
|
||||
EnumValue _enumArray; /* [array of count] */
|
||||
/* Offset _offsetArray; // [array of count] after enumValue[count-1] */
|
||||
/* -- end pnames data -- */
|
||||
|
||||
friend class ::Builder;
|
||||
|
||||
EnumValue* getEnumArray() {
|
||||
return &_enumArray;
|
||||
}
|
||||
|
||||
const EnumValue* getEnumArray() const {
|
||||
return &_enumArray;
|
||||
}
|
||||
|
||||
Offset* getOffsetArray() {
|
||||
return (Offset*) (getEnumArray() + count);
|
||||
}
|
||||
|
||||
const Offset* getOffsetArray() const {
|
||||
return (Offset*) (getEnumArray() + count);
|
||||
}
|
||||
|
||||
static int32_t getSize(int32_t n) {
|
||||
return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n;
|
||||
}
|
||||
|
||||
int32_t getSize() {
|
||||
return getSize(count);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
Offset getOffset(EnumValue enumProbe) const {
|
||||
const EnumValue* e = getEnumArray();
|
||||
const Offset* p = getOffsetArray();
|
||||
/* linear search; binary later if warranted */
|
||||
/* (binary is not faster for short lists) */
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
if (e[i] < enumProbe) continue;
|
||||
if (e[i] > enumProbe) break;
|
||||
return p[i];
|
||||
}
|
||||
return 0; /* not found */
|
||||
}
|
||||
|
||||
static int32_t
|
||||
swap(const UDataSwapper *ds,
|
||||
const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
|
||||
uint8_t *temp, int32_t pos,
|
||||
UErrorCode *pErrorCode);
|
||||
};
|
||||
|
||||
/* ---------------------------------------------------------------------- */
|
||||
/* NameToEnum */
|
||||
|
||||
/**
|
||||
* A map from names to enum values.
|
||||
*/
|
||||
class NameToEnum {
|
||||
|
||||
/* -- begin pnames data -- */
|
||||
int32_t count; /* number of entries */
|
||||
EnumValue _enumArray; /* [array of count] EnumValues */
|
||||
/* Offset _nameArray; // [array of count] offsets to names */
|
||||
/* -- end pnames data -- */
|
||||
|
||||
friend class ::Builder;
|
||||
|
||||
EnumValue* getEnumArray() {
|
||||
return &_enumArray;
|
||||
}
|
||||
|
||||
const EnumValue* getEnumArray() const {
|
||||
return &_enumArray;
|
||||
}
|
||||
|
||||
Offset* getNameArray() {
|
||||
return (Offset*) (getEnumArray() + count);
|
||||
}
|
||||
|
||||
const Offset* getNameArray() const {
|
||||
return (Offset*) (getEnumArray() + count);
|
||||
}
|
||||
|
||||
static int32_t getSize(int32_t n) {
|
||||
return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n;
|
||||
}
|
||||
|
||||
int32_t getSize() {
|
||||
return getSize(count);
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
EnumValue getEnum(const char* alias, const PropertyAliases& data) const {
|
||||
|
||||
const Offset* n = getNameArray();
|
||||
const EnumValue* e = getEnumArray();
|
||||
|
||||
/* linear search; binary later if warranted */
|
||||
/* (binary is not faster for short lists) */
|
||||
for (int32_t i=0; i<count; ++i) {
|
||||
const char* name = (const char*) data.getPointer(n[i]);
|
||||
int32_t c = uprv_comparePropertyNames(alias, name);
|
||||
if (c > 0) continue;
|
||||
if (c < 0) break;
|
||||
return e[i];
|
||||
}
|
||||
|
||||
return UCHAR_INVALID_CODE;
|
||||
}
|
||||
|
||||
static int32_t
|
||||
swap(const UDataSwapper *ds,
|
||||
const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
|
||||
uint8_t *temp, int32_t pos,
|
||||
UErrorCode *pErrorCode);
|
||||
};
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
*
|
||||
* In-memory layout. THIS IS NOT A STANDALONE DOCUMENT. It goes
|
||||
* together with above C++ declarations and gives an overview.
|
||||
*
|
||||
* See above for definitions of Offset and EnumValue. Also, refer to
|
||||
* above class declarations for the "bottom line" on data layout.
|
||||
*
|
||||
* Sizes:
|
||||
* '*_offset' is an Offset (see above)
|
||||
* 'count' members are typically int32_t (see above declarations)
|
||||
* 'enumArray' is an array of EnumValue (see above)
|
||||
* 'offsetArray' is an array of Offset (see above)
|
||||
* 'nameArray' is an array of Offset (see above)
|
||||
* 'enum*' is an EnumValue (see above)
|
||||
* '*Array [x n]' means that *Array has n elements
|
||||
*
|
||||
* References:
|
||||
* Instead of pointers, this flat data structure contains offsets.
|
||||
* All offsets are relative to the start of 'header'. A notation
|
||||
* is used to indicate what structure each offset points to:
|
||||
* 'foo (>x)' the offset(s) in foo point to structure x
|
||||
*
|
||||
* Structures:
|
||||
* Each structure is assigned a number, except for the header,
|
||||
* which is called 'header'. The numbers are not contiguous
|
||||
* for historical reasons. Some structures have sub-parts
|
||||
* that are denoted with a letter, e.g., "5a".
|
||||
*
|
||||
* BEGIN LAYOUT
|
||||
* ============
|
||||
* header:
|
||||
* enumToName_offset (>0)
|
||||
* nameToEnum_offset (>2)
|
||||
* enumToValue_offset (>3)
|
||||
* (alignment padding built into the header)
|
||||
*
|
||||
* The header also contains the following, used by "external readers"
|
||||
* like ICU4J and icuswap.
|
||||
*
|
||||
* // The following are needed by external readers of this data.
|
||||
* // We don't use them ourselves.
|
||||
* int16_t total_size; // size in bytes excluding the udata header
|
||||
* Offset valueMap_offset; // offset to start of array
|
||||
* int16_t valueMap_count; // number of entries
|
||||
* Offset nameGroupPool_offset; // offset to start of array
|
||||
* int16_t nameGroupPool_count; // number of entries (not groups)
|
||||
* Offset stringPool_offset; // offset to start of pool
|
||||
* int16_t stringPool_count; // number of strings (not size in bytes)
|
||||
*
|
||||
* 0: # NonContiguousEnumToOffset obj for props => name groups
|
||||
* count
|
||||
* enumArray [x count]
|
||||
* offsetArray [x count] (>98)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* (1: omitted -- no longer used)
|
||||
*
|
||||
* 2: # NameToEnum obj for binary & enumerated props
|
||||
* count
|
||||
* enumArray [x count]
|
||||
* nameArray [x count] (>99)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps
|
||||
* count
|
||||
* enumArray [x count]
|
||||
* offsetArray [x count] (>4)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* 4: # ValueMap array [x one for each enumerated prop i]
|
||||
* enumToName_offset (>5a +2*i) one of these two is NULL, one is not
|
||||
* ncEnumToName_offset (>5b +2*i)
|
||||
* nameToEnums_offset (>6 +2*i)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* for each enumerated prop (either 5a or 5b):
|
||||
*
|
||||
* 5a: # EnumToOffset for enumerated prop's values => name groups
|
||||
* enumStart
|
||||
* enumLimit
|
||||
* offsetArray [x enumLimit - enumStart] (>98)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* 5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups
|
||||
* count
|
||||
* enumArray [x count]
|
||||
* offsetArray [x count] (>98)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* 6: # NameToEnum for enumerated prop's values
|
||||
* count
|
||||
* enumArray [x count]
|
||||
* nameArray [x count] (>99)
|
||||
*
|
||||
* => pad to next 4-byte boundary
|
||||
*
|
||||
* 98: # name group pool {NGP}
|
||||
* [array of Offset values] (>99)
|
||||
*
|
||||
* 99: # string pool {SP}
|
||||
* [pool of nul-terminated char* strings]
|
||||
*/
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* C++ */
|
||||
|
||||
#endif
|
||||
|
|
1240
icu4c/source/common/propname_data.h
Normal file
1240
icu4c/source/common/propname_data.h
Normal file
File diff suppressed because it is too large
Load diff
|
@ -34,11 +34,6 @@ typedef struct UBiDiProps UBiDiProps;
|
|||
U_CFUNC const UBiDiProps *
|
||||
ubidi_getSingleton(void);
|
||||
|
||||
U_CAPI int32_t
|
||||
ubidi_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void
|
||||
ubidi_addPropertyStarts(const UBiDiProps *bdp, const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
|
|
|
@ -34,11 +34,6 @@ typedef struct UCaseProps UCaseProps;
|
|||
U_CAPI const UCaseProps * U_EXPORT2
|
||||
ucase_getSingleton(void);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
|
|
|
@ -44,7 +44,6 @@ typedef enum ECleanupCommonType {
|
|||
UCLN_COMMON_NORMALIZER2,
|
||||
UCLN_COMMON_USET,
|
||||
UCLN_COMMON_UNAMES,
|
||||
UCLN_COMMON_PNAME,
|
||||
UCLN_COMMON_UPROPS,
|
||||
UCLN_COMMON_UCNV,
|
||||
UCLN_COMMON_UCNV_IO,
|
||||
|
|
83
icu4c/source/common/udicttrie.h
Normal file
83
icu4c/source/common/udicttrie.h
Normal file
|
@ -0,0 +1,83 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: udicttrie.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010dec17
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UDICTTRIE_H__
|
||||
#define __UDICTTRIE_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: Helper definitions for dictionary trie APIs.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/**
|
||||
* Return values for ByteTrie::next(), UCharTrie::next() and similar methods.
|
||||
* @see UDICTTRIE_RESULT_MATCHES
|
||||
* @see UDICTTRIE_RESULT_HAS_VALUE
|
||||
* @see UDICTTRIE_RESULT_HAS_NEXT
|
||||
*/
|
||||
/*
 * The numeric values are part of the contract:
 * UDICTTRIE_RESULT_HAS_VALUE() tests (result)>=UDICTTRIE_HAS_FINAL_VALUE and
 * UDICTTRIE_RESULT_HAS_NEXT() tests the low bit, so the enumerators must
 * keep exactly these values. They are spelled out to guard against
 * accidental reordering.
 */
enum UDictTrieResult {
    /**
     * The input unit(s) did not continue a matching string.
     */
    UDICTTRIE_NO_MATCH = 0,
    /**
     * The input unit(s) continued a matching string
     * but there is no value for the string so far.
     * (It is a prefix of a longer string.)
     */
    UDICTTRIE_NO_VALUE = 1,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * No further input byte/unit can continue a matching string.
     */
    UDICTTRIE_HAS_FINAL_VALUE = 2,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * Another input byte/unit can continue a matching string.
     */
    UDICTTRIE_HAS_VALUE = 3
};
|
||||
|
||||
/**
|
||||
* Same as (result!=UDICTTRIE_NO_MATCH).
|
||||
* @param result A result from ByteTrie::first(), UCharTrie::next() etc.
|
||||
* @return true if the input bytes/units so far are part of a matching string/byte sequence.
|
||||
*/
|
||||
#define UDICTTRIE_RESULT_MATCHES(result) ((result)!=UDICTTRIE_NO_MATCH)
|
||||
|
||||
/**
|
||||
* Equivalent to (result==UDICTTRIE_HAS_VALUE || result==UDICTTRIE_HAS_FINAL_VALUE) but
|
||||
* this macro evaluates result exactly once.
|
||||
* @param result A result from ByteTrie::first(), UCharTrie::next() etc.
|
||||
* @return true if there is a value for the input bytes/units so far.
|
||||
* @see ByteTrie::getValue
|
||||
* @see UCharTrie::getValue
|
||||
*/
|
||||
#define UDICTTRIE_RESULT_HAS_VALUE(result) ((result)>=UDICTTRIE_HAS_FINAL_VALUE)
|
||||
|
||||
/**
|
||||
* Equivalent to (result==UDICTTRIE_NO_VALUE || result==UDICTTRIE_HAS_VALUE) but
|
||||
* this macro evaluates result exactly once.
|
||||
* @param result A result from ByteTrie::first(), UCharTrie::next() etc.
|
||||
* @return true if another input byte/unit can continue a matching string.
|
||||
*/
|
||||
#define UDICTTRIE_RESULT_HAS_NEXT(result) ((result)&1)
|
||||
|
||||
#endif /* __UDICTTRIE_H__ */
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1997-2009, International Business Machines
|
||||
* Copyright (C) 1997-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
* Date Name Description
|
||||
|
@ -866,6 +866,11 @@ uhash_hashUCharsN(const UChar *str, int32_t length) {
|
|||
STRING_HASH(UChar, str, length, *p);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashCharsN(const char *str, int32_t length) {
|
||||
STRING_HASH(char, str, length, *p);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashChars(const UHashTok key) {
|
||||
STRING_HASH(uint8_t, key.pointer, uprv_strlen((char*)p), *p);
|
||||
|
|
|
@ -583,6 +583,9 @@ uhash_hashChars(const UHashTok key);
|
|||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashUCharsN(const UChar *key, int32_t length);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uhash_hashCharsN(const char *key, int32_t length);
|
||||
|
||||
/**
|
||||
* Generate a case-insensitive hash code for a null-terminated char*
|
||||
* string. If the string is not null-terminated do not use this
|
||||
|
|
|
@ -104,6 +104,29 @@ static const uint8_t ebcdicFromAscii[256]={
|
|||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
/* Same as asciiFromEbcdic[] except maps all letters to lowercase. */
|
||||
static const uint8_t lowercaseAsciiFromEbcdic[256]={
|
||||
0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
|
||||
0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
|
||||
0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,
|
||||
|
||||
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
|
||||
0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
|
||||
0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
|
||||
|
||||
0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
|
||||
0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,
|
||||
|
||||
0x7b, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x7d, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x7c, 0x00, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
|
||||
};
|
||||
|
||||
/*
|
||||
* Bit sets indicating which characters of the ASCII repertoire
|
||||
* (by ASCII/Unicode code) are "invariant".
|
||||
|
@ -535,6 +558,10 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) {
|
|||
}
|
||||
}
|
||||
|
||||
U_CAPI char U_EXPORT2
|
||||
uprv_ebcdicToLowercaseAscii(char c) {
|
||||
return (char)lowercaseAsciiFromEbcdic[(uint8_t)c];
|
||||
}
|
||||
|
||||
U_INTERNAL uint8_t* U_EXPORT2
|
||||
uprv_aestrncpy(uint8_t *dst, const uint8_t *src, int32_t n)
|
||||
|
|
|
@ -83,6 +83,26 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2);
|
|||
# error Unknown charset family!
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts an EBCDIC invariant character to lowercase ASCII.
|
||||
* @internal
|
||||
*/
|
||||
U_INTERNAL char U_EXPORT2
|
||||
uprv_ebcdicToLowercaseAscii(char c);
|
||||
|
||||
/**
|
||||
* \def uprv_invCharToLowercaseAscii
|
||||
* Converts an invariant character to lowercase ASCII.
|
||||
* @internal
|
||||
*/
|
||||
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
|
||||
# define uprv_invCharToLowercaseAscii uprv_asciitolower
|
||||
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
|
||||
# define uprv_invCharToLowercaseAscii uprv_ebcdicToLowercaseAscii
|
||||
#else
|
||||
# error Unknown charset family!
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Copy EBCDIC to ASCII
|
||||
* @internal
|
||||
|
|
|
@ -1160,7 +1160,6 @@
|
|||
#define uplug_setPlugLevel U_ICU_ENTRY_POINT_RENAME(uplug_setPlugLevel)
|
||||
#define uplug_setPlugName U_ICU_ENTRY_POINT_RENAME(uplug_setPlugName)
|
||||
#define uplug_setPlugNoUnload U_ICU_ENTRY_POINT_RENAME(uplug_setPlugNoUnload)
|
||||
#define upname_swap U_ICU_ENTRY_POINT_RENAME(upname_swap)
|
||||
#define uprops_getSource U_ICU_ENTRY_POINT_RENAME(uprops_getSource)
|
||||
#define upropsvec_addPropertyStarts U_ICU_ENTRY_POINT_RENAME(upropsvec_addPropertyStarts)
|
||||
#define uprv_aestrncpy U_ICU_ENTRY_POINT_RENAME(uprv_aestrncpy)
|
||||
|
|
|
@ -162,15 +162,6 @@ enum {
|
|||
UNORM_NX_CJK_COMPAT=2
|
||||
};
|
||||
|
||||
/**
|
||||
* Swap unorm.icu. See udataswp.h.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Description of the format of unorm.icu version 2.3.
|
||||
*
|
||||
|
|
|
@ -397,15 +397,6 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
|
|||
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
*/
|
||||
|
||||
/**
|
||||
* Swap the ICU Unicode properties file. See uchar.c.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uprops_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Swap the ICU Unicode character names file. See uchar.c.
|
||||
* @internal
|
||||
|
|
3
icu4c/source/configure
vendored
3
icu4c/source/configure
vendored
|
@ -7748,7 +7748,7 @@ then
|
|||
fi
|
||||
|
||||
# output the Makefiles
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
|
@ -8489,6 +8489,7 @@ do
|
|||
"test/letest/Makefile") CONFIG_FILES="$CONFIG_FILES test/letest/Makefile" ;;
|
||||
"test/perf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/Makefile" ;;
|
||||
"test/perf/collationperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/collationperf/Makefile" ;;
|
||||
"test/perf/dicttrieperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/dicttrieperf/Makefile" ;;
|
||||
"test/perf/ubrkperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/ubrkperf/Makefile" ;;
|
||||
"test/perf/charperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/charperf/Makefile" ;;
|
||||
"test/perf/convperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/convperf/Makefile" ;;
|
||||
|
|
|
@ -1354,6 +1354,7 @@ AC_CONFIG_FILES([icudefs.mk \
|
|||
test/letest/Makefile \
|
||||
test/perf/Makefile \
|
||||
test/perf/collationperf/Makefile \
|
||||
test/perf/dicttrieperf/Makefile \
|
||||
test/perf/ubrkperf/Makefile \
|
||||
test/perf/charperf/Makefile \
|
||||
test/perf/convperf/Makefile \
|
||||
|
|
|
@ -226,8 +226,10 @@ package390: $(OUTTMPDIR)/icudata390.lst $(PKGDATA_LIST) ./icupkg.inc packagedata
|
|||
## DAT files - Misc. data files.
|
||||
# 2005-may-05 Removed Unicode properties files (unorm.icu, uprops.icu, ucase.icu, ubidi.icu)
|
||||
# from data build. See Jitterbug 4497. (makedata.mak revision 1.117)
|
||||
# 2010-dec Removed pnames.icu.
|
||||
# These are now hardcoded in ICU4C and only loaded in ICU4J.
|
||||
#
|
||||
DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm uts46.nrm
|
||||
DAT_FILES_SHORT=unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm uts46.nrm
|
||||
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
|
||||
|
||||
## BRK files
|
||||
|
@ -411,7 +413,7 @@ COLL_FILES_LIST=$(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT)
|
|||
BRK_FILES_LIST=$(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT)
|
||||
LOCALE_FILES_LIST= $(RES_FILES_SHORT) $(LANG_FILES_SHORT) $(REGION_FILES_SHORT) $(ZONE_FILES_SHORT)
|
||||
MISC_FILES_LIST=$(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(CNV_FILES_SHORT_SPECIAL) $(CURR_FILES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT)
|
||||
UNI_CORE_DATA=uprops.icu ucase.icu ubidi.icu
|
||||
UNI_CORE_DATA=pnames.icu uprops.icu ucase.icu ubidi.icu
|
||||
UNI_CORE_TARGET_DATA=$(UNI_CORE_DATA:%=$(BUILDDIR)/%)
|
||||
|
||||
ifneq ($(INCLUDE_UNI_CORE_DATA),)
|
||||
|
@ -494,7 +496,7 @@ $(BUILDDIR)/coll/%.icu: $(SRCDATADIR)/in/coll/%.icu
|
|||
#################################################### SPP
|
||||
# SPP FILES
|
||||
|
||||
$(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BUILDDIR)/unames.icu $(BUILDDIR)/pnames.icu
|
||||
$(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BUILDDIR)/unames.icu
|
||||
$(INVOKE) $(TOOLBINDIR)/gensprep -d $(BUILDDIR) -i $(BUILDDIR) -s $(SPREPSRCDIR) -b $(@F:%.spp=%) -m $(UNICODEDATADIR) -u 3.2.0 $(<F)
|
||||
|
||||
#################################################### BRK
|
||||
|
@ -753,11 +755,10 @@ clean-resindex:
|
|||
$(BUILDDIR)/$(INDEX_NAME).res: $(INDEX_FILE) $(TOOLBINDIR)/genrb$(TOOLEXEEXT)
|
||||
$(INVOKE) $(TOOLBINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -d $(BUILDDIR) $(INDEX_FILE)
|
||||
|
||||
# The core Unicode properties files (uprops.icu, ucase.icu, ubidi.icu)
|
||||
# The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu)
|
||||
# are hardcoded in the common DLL and therefore not included in the data package any more.
|
||||
# They are not built by default but need to be built for ICU4J data and for getting the .c source files
|
||||
# when updating the Unicode data.
|
||||
# Changed in Makefile.in revision 1.147. See Jitterbug 4497.
|
||||
uni-core-data: build-dir $(UNI_CORE_TARGET_DATA)
|
||||
@echo Unicode .icu files built to $(BUILDDIR)
|
||||
|
||||
|
@ -778,7 +779,7 @@ JAR=jar
|
|||
# - package them into the .jar file
|
||||
$(OUTDIR)/icu4j/icudata.jar: build-dir packagedata $(OUTTMPDIR)/$(ICUDATA_PLATFORM_NAME).dat uni-core-data
|
||||
mkdir -p $(OUTDIR)/icu4j/com/ibm/icu/impl/data/$(ICUDATA_BASENAME_VERSION)b
|
||||
echo ubidi.icu ucase.icu uprops.icu > $(OUTDIR)/icu4j/add.txt
|
||||
echo pnames.icu ubidi.icu ucase.icu uprops.icu > $(OUTDIR)/icu4j/add.txt
|
||||
$(INVOKE) $(TOOLBINDIR)/icupkg $(OUTTMPDIR)/$(ICUDATA_PLATFORM_NAME).dat $(OUTDIR)/icu4j/$(ICUDATA_BASENAME_VERSION)b.dat -a $(OUTDIR)/icu4j/add.txt -s $(BUILDDIR) -x '*' -tb -d $(OUTDIR)/icu4j/com/ibm/icu/impl/data/$(ICUDATA_BASENAME_VERSION)b
|
||||
$(JAR) cf $(OUTDIR)/icu4j/icudata.jar -C $(OUTDIR)/icu4j com/ibm/icu/impl/data/$(ICUDATA_BASENAME_VERSION)b
|
||||
|
||||
|
@ -821,9 +822,9 @@ pkgdataMakefile:
|
|||
###########
|
||||
########### 390 (z/OS) support
|
||||
UCMFILES390=ebcdic-xml-us.ucm ibm-37_P100-1995.ucm ibm-1047_P100-1995.ucm ibm-4909_P100-1999.ucm
|
||||
# used to depend on uprops.icu ucase.icu ubidi.icu
|
||||
# see Jitterbug 4497
|
||||
ALLFILES390=pnames.icu cnvalias.icu $(UCMFILES390:.ucm=.cnv)
|
||||
# used to depend on pnames.icu uprops.icu ucase.icu ubidi.icu
|
||||
# These are now hardcoded in ICU4C and only loaded in ICU4J.
|
||||
ALLFILES390=cnvalias.icu $(UCMFILES390:.ucm=.cnv)
|
||||
|
||||
$(OUTTMPDIR)/icudata390.lst: $(SRCLISTDEPS)
|
||||
@echo "generating $@ (list of 390 data files)"
|
||||
|
|
Binary file not shown.
|
@ -486,9 +486,10 @@ ALL : GODATA "$(ICU_LIB_TARGET)" "$(TESTDATAOUT)\testdata.dat"
|
|||
# They are not built by default but need to be built for ICU4J data and for getting the .c source files
|
||||
# when updating the Unicode data.
|
||||
# Changed in makedata.mak revision 1.117. See Jitterbug 4497.
|
||||
# 2010-dec Removed pnames.icu.
|
||||
# Command line:
|
||||
# C:\svn\icuproj\icu\trunk\source\data>nmake -f makedata.mak ICUMAKE=C:\svn\icuproj\icu\trunk\source\data\ CFG=x86\Debug uni-core-data
|
||||
uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
|
||||
uni-core-data: GODATA "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
|
||||
@echo Unicode .icu files built to "$(ICUBLD_PKG)"
|
||||
|
||||
# Build the ICU4J icudata.jar and testdata.jar.
|
||||
|
@ -501,7 +502,7 @@ uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(IC
|
|||
# - package them into the .jar file
|
||||
"$(ICUOUT)\icu4j\icudata.jar": GODATA "$(ICUOUT)\$(ICUPKG).dat" uni-core-data
|
||||
if not exist "$(ICUOUT)\icu4j\com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b" mkdir "$(ICUOUT)\icu4j\com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b"
|
||||
echo ubidi.icu ucase.icu uprops.icu > "$(ICUOUT)\icu4j\add.txt"
|
||||
echo pnames.icu ubidi.icu ucase.icu uprops.icu > "$(ICUOUT)\icu4j\add.txt"
|
||||
"$(ICUPBIN)\icupkg" "$(ICUOUT)\$(ICUPKG).dat" "$(ICUOUT)\icu4j\$(U_ICUDATA_NAME)b.dat" -a "$(ICUOUT)\icu4j\add.txt" -s "$(ICUBLD_PKG)" -x * -tb -d "$(ICUOUT)\icu4j\com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b"
|
||||
"$(JAR)" cf "$(ICUOUT)\icu4j\icudata.jar" -C "$(ICUOUT)\icu4j" com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b
|
||||
|
||||
|
@ -586,11 +587,10 @@ icu4j-data-install :
|
|||
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
|
||||
-@erase "$(ICUTMP)\$(ICUPKG).dat"
|
||||
!ELSE
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
@echo Building icu data
|
||||
cd "$(ICUBLD_PKG)"
|
||||
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
|
||||
pnames.icu
|
||||
unames.icu
|
||||
confusables.cfu
|
||||
$(ICUCOL)\ucadata.icu
|
||||
|
@ -985,9 +985,8 @@ $(UCM_SOURCE_SPECIAL): {"$(ICUTOOLS)\makeconv\$(CFG)"}makeconv.exe
|
|||
# See Jitterbug 4497 for details.
|
||||
$(MISC_SOURCE) $(RB_FILES) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(BRK_RES_FILES) $(TRANSLIT_RES_FILES): {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu"
|
||||
|
||||
# This used to depend on "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
|
||||
# This data is now hard coded as a part of the library.
|
||||
# See Jitterbug 4497 for details.
|
||||
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\nfc.nrm"
|
||||
# This used to depend on "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
|
||||
# These are now hardcoded in ICU4C and only loaded in ICU4J.
|
||||
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\nfc.nrm"
|
||||
!ENDIF
|
||||
|
||||
|
|
|
@ -13,6 +13,20 @@
|
|||
|
||||
---------------------------------------------------------------------------- ***
|
||||
|
||||
Unicode 6.1 update
|
||||
|
||||
(TODO: Copy and adjust most of the 6.0 update instructions,
|
||||
except retain this following section in this new form.
|
||||
So far, this just documents the new procedure for building the property names data.)
|
||||
|
||||
* run genpname
|
||||
(builds both pnames.icu and propname_data.h)
|
||||
- ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in
|
||||
- ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/common --csource
|
||||
- rebuild ICU & tools
|
||||
|
||||
---------------------------------------------------------------------------- ***
|
||||
|
||||
Unicode 6.0 update
|
||||
|
||||
*** related ICU Trac tickets
|
||||
|
|
|
@ -52,7 +52,6 @@
|
|||
#include "ucol_swp.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "sprpimpl.h"
|
||||
#include "propname.h"
|
||||
#include "rbbidata.h"
|
||||
|
||||
/* swapping implementation in i18n */
|
||||
|
@ -1310,10 +1309,16 @@ static const struct {
|
|||
{"thaidict", "ctd", triedict_swap},
|
||||
#endif
|
||||
|
||||
/* the last item should not be #if'ed so that it can reliably omit the last comma */
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Starting with ICU 4.8, the Unicode property (value) aliases data
|
||||
* is hardcoded in the ICU4C common library.
|
||||
* The swapper was moved to the toolutil library for swapping for ICU4J.
|
||||
*/
|
||||
/* Unicode properties */
|
||||
{"pnames", "icu", upname_swap},
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Starting with ICU4C 3.4, the core Unicode properties files
|
||||
|
@ -1336,6 +1341,7 @@ static const struct {
|
|||
{"confusables", "cfu", uspoof_swap},
|
||||
#endif
|
||||
{"unames", "icu", uchar_swapNames}
|
||||
/* the last item should not be #if'ed so that it can reliably omit the last comma */
|
||||
};
|
||||
|
||||
/* Large enough for the largest swappable data item. */
|
||||
|
@ -1673,6 +1679,7 @@ TestSwapData() {
|
|||
uprv_strcat(name, swapCases[i].type);
|
||||
|
||||
pData=udata_open(pkg, swapCases[i].type, nm, &errorCode);
|
||||
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
TestSwapCase(pData, name, swapCases[i].swapFn, buffer, buffer+SWAP_BUFFER_SIZE);
|
||||
udata_close(pData);
|
||||
|
|
|
@ -50,6 +50,7 @@ sdtfmtts.o svccoll.o tchcfmt.o selfmts.o \
|
|||
tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
|
||||
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
|
||||
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
|
||||
bytetrietest.o uchartrietest.o \
|
||||
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
|
||||
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
|
||||
jamotest.o srchtest.o reptest.o regextst.o \
|
||||
|
|
843
icu4c/source/test/intltest/bytetrietest.cpp
Normal file
843
icu4c/source/test/intltest/bytetrietest.cpp
Normal file
|
@ -0,0 +1,843 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetrietest.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov16
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "bytetrie.h"
|
||||
#include "bytetriebuilder.h"
|
||||
#include "bytetrieiterator.h"
|
||||
#include "intltest.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
// One test item: a NUL-terminated byte string and the value the trie
// is expected to store for it.
struct StringAndValue {
    const char *s;  // key bytes (C string)
    int32_t value;  // value associated with the key
};
|
||||
|
||||
class ByteTrieTest : public IntlTest {
|
||||
public:
|
||||
ByteTrieTest() {}
|
||||
virtual ~ByteTrieTest();
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
|
||||
void TestBuilder();
|
||||
void TestEmpty();
|
||||
void Test_a();
|
||||
void Test_a_ab();
|
||||
void TestShortestBranch();
|
||||
void TestBranches();
|
||||
void TestLongSequence();
|
||||
void TestLongBranch();
|
||||
void TestValuesForState();
|
||||
void TestCompact();
|
||||
|
||||
StringPiece buildMonthsTrie(ByteTrieBuilder &builder, UDictTrieBuildOption buildOption);
|
||||
void TestHasUniqueValue();
|
||||
void TestGetNextBytes();
|
||||
void TestIteratorFromBranch();
|
||||
void TestIteratorFromLinearMatch();
|
||||
void TestTruncatingIteratorFromRoot();
|
||||
void TestTruncatingIteratorFromLinearMatchShort();
|
||||
void TestTruncatingIteratorFromLinearMatchLong();
|
||||
|
||||
void checkData(const StringAndValue data[], int32_t dataLength);
|
||||
void checkData(const StringAndValue data[], int32_t dataLength, UDictTrieBuildOption buildOption);
|
||||
StringPiece buildTrie(const StringAndValue data[], int32_t dataLength,
|
||||
ByteTrieBuilder &builder, UDictTrieBuildOption buildOption);
|
||||
void checkFirst(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
|
||||
void checkNext(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
|
||||
void checkNextWithState(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
|
||||
void checkNextString(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
|
||||
void checkIterator(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
|
||||
void checkIterator(ByteTrieIterator &iter, const StringAndValue data[], int32_t dataLength);
|
||||
};
|
||||
|
||||
// Factory for the test suite; returns a newly allocated ByteTrieTest.
extern IntlTest *createByteTrieTest() {
    IntlTest *suite=new ByteTrieTest();
    return suite;
}
|
||||
|
||||
// Out-of-line destructor with an empty body.
ByteTrieTest::~ByteTrieTest() {}
|
||||
|
||||
// Dispatches to the individual test functions via the TESTCASE_AUTO macros.
// The "par" parameter is unused.
void ByteTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
    if(exec) {
        logln("TestSuite ByteTrieTest: ");
    }

    TESTCASE_AUTO_BEGIN;

    // Table-driven build-and-check tests.
    TESTCASE_AUTO(TestBuilder);
    TESTCASE_AUTO(TestEmpty);
    TESTCASE_AUTO(Test_a);
    TESTCASE_AUTO(Test_a_ab);
    TESTCASE_AUTO(TestShortestBranch);
    TESTCASE_AUTO(TestBranches);
    TESTCASE_AUTO(TestLongSequence);
    TESTCASE_AUTO(TestLongBranch);
    TESTCASE_AUTO(TestValuesForState);
    TESTCASE_AUTO(TestCompact);

    // Tests of individual ByteTrie / ByteTrieIterator functions.
    TESTCASE_AUTO(TestHasUniqueValue);
    TESTCASE_AUTO(TestGetNextBytes);
    TESTCASE_AUTO(TestIteratorFromBranch);
    TESTCASE_AUTO(TestIteratorFromLinearMatch);
    TESTCASE_AUTO(TestTruncatingIteratorFromRoot);
    TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchShort);
    TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchLong);

    TESTCASE_AUTO_END;
}
|
||||
|
||||
void ByteTrieTest::TestBuilder() {
|
||||
IcuTestErrorCode errorCode(*this, "TestBuilder()");
|
||||
ByteTrieBuilder builder;
|
||||
builder.build(UDICTTRIE_BUILD_FAST, errorCode);
|
||||
if(errorCode.reset()!=U_INDEX_OUTOFBOUNDS_ERROR) {
|
||||
errln("ByteTrieBuilder().build() did not set U_INDEX_OUTOFBOUNDS_ERROR");
|
||||
return;
|
||||
}
|
||||
builder.add("=", 0, errorCode).add("=", 1, errorCode).build(UDICTTRIE_BUILD_FAST, errorCode);
|
||||
if(errorCode.reset()!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
errln("ByteTrieBuilder.build() did not detect duplicates");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestEmpty() {
|
||||
static const StringAndValue data[]={
|
||||
{ "", 0 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::Test_a() {
|
||||
static const StringAndValue data[]={
|
||||
{ "a", 1 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::Test_a_ab() {
|
||||
static const StringAndValue data[]={
|
||||
{ "a", 1 },
|
||||
{ "ab", 100 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestShortestBranch() {
|
||||
static const StringAndValue data[]={
|
||||
{ "a", 1000 },
|
||||
{ "b", 2000 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestBranches() {
|
||||
static const StringAndValue data[]={
|
||||
{ "a", 0x10 },
|
||||
{ "cc", 0x40 },
|
||||
{ "e", 0x100 },
|
||||
{ "ggg", 0x400 },
|
||||
{ "i", 0x1000 },
|
||||
{ "kkkk", 0x4000 },
|
||||
{ "n", 0x10000 },
|
||||
{ "ppppp", 0x40000 },
|
||||
{ "r", 0x100000 },
|
||||
{ "sss", 0x200000 },
|
||||
{ "t", 0x400000 },
|
||||
{ "uu", 0x800000 },
|
||||
{ "vv", 0x7fffffff },
|
||||
{ "zz", 0x80000000 }
|
||||
};
|
||||
for(int32_t length=2; length<=LENGTHOF(data); ++length) {
|
||||
infoln("TestBranches length=%d", (int)length);
|
||||
checkData(data, length);
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestLongSequence() {
|
||||
static const StringAndValue data[]={
|
||||
{ "a", -1 },
|
||||
// sequence of linear-match nodes
|
||||
{ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", -2 },
|
||||
// more than 256 bytes
|
||||
{ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", -3 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestLongBranch() {
|
||||
// Split-branch and interesting compact-integer values.
|
||||
static const StringAndValue data[]={
|
||||
{ "a", -2 },
|
||||
{ "b", -1 },
|
||||
{ "c", 0 },
|
||||
{ "d2", 1 },
|
||||
{ "f", 0x3f },
|
||||
{ "g", 0x40 },
|
||||
{ "h", 0x41 },
|
||||
{ "j23", 0x1900 },
|
||||
{ "j24", 0x19ff },
|
||||
{ "j25", 0x1a00 },
|
||||
{ "k2", 0x1a80 },
|
||||
{ "k3", 0x1aff },
|
||||
{ "l234567890", 0x1b00 },
|
||||
{ "l234567890123", 0x1b01 },
|
||||
{ "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn", 0x10ffff },
|
||||
{ "oooooooooooooooooooooooooooooooooooooooooooooooooooooo", 0x110000 },
|
||||
{ "pppppppppppppppppppppppppppppppppppppppppppppppppppppp", 0x120000 },
|
||||
{ "r", 0x333333 },
|
||||
{ "s2345", 0x4444444 },
|
||||
{ "t234567890", 0x77777777 },
|
||||
{ "z", 0x80000001 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestValuesForState() {
|
||||
// Check that saveState() and resetToState() interact properly
|
||||
// with next() and current().
|
||||
static const StringAndValue data[]={
|
||||
{ "a", -1 },
|
||||
{ "ab", -2 },
|
||||
{ "abc", -3 },
|
||||
{ "abcd", -4 },
|
||||
{ "abcde", -5 },
|
||||
{ "abcdef", -6 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestCompact() {
|
||||
// Duplicate trailing strings and values provide opportunities for compacting.
|
||||
static const StringAndValue data[]={
|
||||
{ "+", 0 },
|
||||
{ "+august", 8 },
|
||||
{ "+december", 12 },
|
||||
{ "+july", 7 },
|
||||
{ "+june", 6 },
|
||||
{ "+november", 11 },
|
||||
{ "+october", 10 },
|
||||
{ "+september", 9 },
|
||||
{ "-", 0 },
|
||||
{ "-august", 8 },
|
||||
{ "-december", 12 },
|
||||
{ "-july", 7 },
|
||||
{ "-june", 6 },
|
||||
{ "-november", 11 },
|
||||
{ "-october", 10 },
|
||||
{ "-september", 9 },
|
||||
// The l+n branch (with its sub-nodes) is a duplicate but will be written
|
||||
// both times because each time it follows a different linear-match node.
|
||||
{ "xjuly", 7 },
|
||||
{ "xjune", 6 }
|
||||
};
|
||||
checkData(data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
// Builds the "months" trie shared by several tests and returns what
// buildTrie() returns for it.
//
// The table has all types of nodes leading to the same value, for code
// coverage of recursive functions. In particular, it has a lot of branches
// on one single level in order to exercise a split-branch node.
StringPiece ByteTrieTest::buildMonthsTrie(ByteTrieBuilder &builder, UDictTrieBuildOption buildOption) {
    static const StringAndValue months[]={
        { "august", 8 },
        { "jan", 1 },
        { "jan.", 1 },
        { "jana", 1 },
        { "janbb", 1 },
        { "janc", 1 },
        { "janddd", 1 },
        { "janee", 1 },
        { "janef", 1 },
        { "janf", 1 },
        { "jangg", 1 },
        { "janh", 1 },
        { "janiiii", 1 },
        { "janj", 1 },
        { "jankk", 1 },
        { "jankl", 1 },
        { "jankmm", 1 },
        { "janl", 1 },
        { "janm", 1 },
        { "jannnnnnnnnnnnnnnnnnnnnnnnnnnnn", 1 },
        { "jano", 1 },
        { "janpp", 1 },
        { "janqqq", 1 },
        { "janr", 1 },
        { "januar", 1 },
        { "january", 1 },
        { "july", 7 },
        { "jun", 6 },
        { "jun.", 6 },
        { "june", 6 }
    };
    const int32_t monthCount=LENGTHOF(months);
    return buildTrie(months, monthCount, builder, buildOption);
}
|
||||
|
||||
void ByteTrieTest::TestHasUniqueValue() {
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_FAST);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
ByteTrie trie(sp.data());
|
||||
int32_t uniqueValue;
|
||||
if(trie.hasUniqueValue(uniqueValue)) {
|
||||
errln("unique value at root");
|
||||
}
|
||||
trie.next('j');
|
||||
trie.next('a');
|
||||
trie.next('n');
|
||||
// hasUniqueValue() directly after next()
|
||||
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=1) {
|
||||
errln("not unique value 1 after \"jan\"");
|
||||
}
|
||||
trie.first('j');
|
||||
trie.next('u');
|
||||
if(trie.hasUniqueValue(uniqueValue)) {
|
||||
errln("unique value after \"ju\"");
|
||||
}
|
||||
if(trie.next('n')!=UDICTTRIE_HAS_VALUE || 6!=trie.getValue()) {
|
||||
errln("not normal value 6 after \"jun\"");
|
||||
}
|
||||
// hasUniqueValue() after getValue()
|
||||
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=6) {
|
||||
errln("not unique value 6 after \"jun\"");
|
||||
}
|
||||
// hasUniqueValue() from within a linear-match node
|
||||
trie.first('a');
|
||||
trie.next('u');
|
||||
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=8) {
|
||||
errln("not unique value 8 after \"au\"");
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestGetNextBytes() {
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_SMALL);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
ByteTrie trie(sp.data());
|
||||
char buffer[40];
|
||||
CheckedArrayByteSink sink(buffer, LENGTHOF(buffer));
|
||||
int32_t count=trie.getNextBytes(sink);
|
||||
if(count!=2 || sink.NumberOfBytesAppended()!=2 || buffer[0]!='a' || buffer[1]!='j') {
|
||||
errln("months getNextBytes()!=[aj] at root");
|
||||
}
|
||||
trie.next('j');
|
||||
trie.next('a');
|
||||
trie.next('n');
|
||||
// getNextBytes() directly after next()
|
||||
count=trie.getNextBytes(sink.Reset());
|
||||
buffer[count]=0;
|
||||
if(count!=20 || sink.NumberOfBytesAppended()!=20 || 0!=strcmp(buffer, ".abcdefghijklmnopqru")) {
|
||||
errln("months getNextBytes()!=[.abcdefghijklmnopqru] after \"jan\"");
|
||||
}
|
||||
// getNextBytes() after getValue()
|
||||
trie.getValue(); // next() had returned UDICTTRIE_HAS_VALUE.
|
||||
memset(buffer, 0, sizeof(buffer));
|
||||
count=trie.getNextBytes(sink.Reset());
|
||||
if(count!=20 || sink.NumberOfBytesAppended()!=20 || 0!=strcmp(buffer, ".abcdefghijklmnopqru")) {
|
||||
errln("months getNextBytes()!=[.abcdefghijklmnopqru] after \"jan\"+getValue()");
|
||||
}
|
||||
// getNextBytes() from a linear-match node
|
||||
trie.next('u');
|
||||
memset(buffer, 0, sizeof(buffer));
|
||||
count=trie.getNextBytes(sink.Reset());
|
||||
if(count!=1 || sink.NumberOfBytesAppended()!=1 || buffer[0]!='a') {
|
||||
errln("months getNextBytes()!=[a] after \"janu\"");
|
||||
}
|
||||
trie.next('a');
|
||||
memset(buffer, 0, sizeof(buffer));
|
||||
count=trie.getNextBytes(sink.Reset());
|
||||
if(count!=1 || sink.NumberOfBytesAppended()!=1 || buffer[0]!='r') {
|
||||
errln("months getNextBytes()!=[r] after \"janua\"");
|
||||
}
|
||||
trie.next('r');
|
||||
trie.next('y');
|
||||
// getNextBytes() after a final match
|
||||
count=trie.getNextBytes(sink.Reset());
|
||||
if(count!=0 || sink.NumberOfBytesAppended()!=0) {
|
||||
errln("months getNextBytes()!=[] after \"january\"");
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestIteratorFromBranch() {
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_FAST);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
ByteTrie trie(sp.data());
|
||||
// Go to a branch node.
|
||||
trie.next('j');
|
||||
trie.next('a');
|
||||
trie.next('n');
|
||||
IcuTestErrorCode errorCode(*this, "TestIteratorFromBranch()");
|
||||
ByteTrieIterator iter(trie, 0, errorCode);
|
||||
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
|
||||
return;
|
||||
}
|
||||
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
|
||||
// following "jan".
|
||||
static const StringAndValue data[]={
|
||||
{ "", 1 },
|
||||
{ ".", 1 },
|
||||
{ "a", 1 },
|
||||
{ "bb", 1 },
|
||||
{ "c", 1 },
|
||||
{ "ddd", 1 },
|
||||
{ "ee", 1 },
|
||||
{ "ef", 1 },
|
||||
{ "f", 1 },
|
||||
{ "gg", 1 },
|
||||
{ "h", 1 },
|
||||
{ "iiii", 1 },
|
||||
{ "j", 1 },
|
||||
{ "kk", 1 },
|
||||
{ "kl", 1 },
|
||||
{ "kmm", 1 },
|
||||
{ "l", 1 },
|
||||
{ "m", 1 },
|
||||
{ "nnnnnnnnnnnnnnnnnnnnnnnnnnnn", 1 },
|
||||
{ "o", 1 },
|
||||
{ "pp", 1 },
|
||||
{ "qqq", 1 },
|
||||
{ "r", 1 },
|
||||
{ "uar", 1 },
|
||||
{ "uary", 1 }
|
||||
};
|
||||
checkIterator(iter, data, LENGTHOF(data));
|
||||
// Reset, and we should get the same result.
|
||||
logln("after iter.reset()");
|
||||
checkIterator(iter.reset(), data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestIteratorFromLinearMatch() {
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_SMALL);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
ByteTrie trie(sp.data());
|
||||
// Go into a linear-match node.
|
||||
trie.next('j');
|
||||
trie.next('a');
|
||||
trie.next('n');
|
||||
trie.next('u');
|
||||
trie.next('a');
|
||||
IcuTestErrorCode errorCode(*this, "TestIteratorFromLinearMatch()");
|
||||
ByteTrieIterator iter(trie, 0, errorCode);
|
||||
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
|
||||
return;
|
||||
}
|
||||
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
|
||||
// following "janua".
|
||||
static const StringAndValue data[]={
|
||||
{ "r", 1 },
|
||||
{ "ry", 1 }
|
||||
};
|
||||
checkIterator(iter, data, LENGTHOF(data));
|
||||
// Reset, and we should get the same result.
|
||||
logln("after iter.reset()");
|
||||
checkIterator(iter.reset(), data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestTruncatingIteratorFromRoot() {
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_FAST);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromRoot()");
|
||||
ByteTrieIterator iter(sp.data(), 4, errorCode);
|
||||
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
|
||||
return;
|
||||
}
|
||||
// Expected data: Same as in buildMonthsTrie(), except only the first 4 characters
|
||||
// of each string, and no string duplicates from the truncation.
|
||||
static const StringAndValue data[]={
|
||||
{ "augu", -1 },
|
||||
{ "jan", 1 },
|
||||
{ "jan.", 1 },
|
||||
{ "jana", 1 },
|
||||
{ "janb", -1 },
|
||||
{ "janc", 1 },
|
||||
{ "jand", -1 },
|
||||
{ "jane", -1 },
|
||||
{ "janf", 1 },
|
||||
{ "jang", -1 },
|
||||
{ "janh", 1 },
|
||||
{ "jani", -1 },
|
||||
{ "janj", 1 },
|
||||
{ "jank", -1 },
|
||||
{ "janl", 1 },
|
||||
{ "janm", 1 },
|
||||
{ "jann", -1 },
|
||||
{ "jano", 1 },
|
||||
{ "janp", -1 },
|
||||
{ "janq", -1 },
|
||||
{ "janr", 1 },
|
||||
{ "janu", -1 },
|
||||
{ "july", 7 },
|
||||
{ "jun", 6 },
|
||||
{ "jun.", 6 },
|
||||
{ "june", 6 }
|
||||
};
|
||||
checkIterator(iter, data, LENGTHOF(data));
|
||||
// Reset, and we should get the same result.
|
||||
logln("after iter.reset()");
|
||||
checkIterator(iter.reset(), data, LENGTHOF(data));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestTruncatingIteratorFromLinearMatchShort() {
|
||||
static const StringAndValue data[]={
|
||||
{ "abcdef", 10 },
|
||||
{ "abcdepq", 200 },
|
||||
{ "abcdeyz", 3000 }
|
||||
};
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildTrie(data, LENGTHOF(data), builder, UDICTTRIE_BUILD_FAST);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
ByteTrie trie(sp.data());
|
||||
// Go into a linear-match node.
|
||||
trie.next('a');
|
||||
trie.next('b');
|
||||
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchShort()");
|
||||
// Truncate within the linear-match node.
|
||||
ByteTrieIterator iter(trie, 2, errorCode);
|
||||
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
|
||||
return;
|
||||
}
|
||||
static const StringAndValue expected[]={
|
||||
{ "cd", -1 }
|
||||
};
|
||||
checkIterator(iter, expected, LENGTHOF(expected));
|
||||
// Reset, and we should get the same result.
|
||||
logln("after iter.reset()");
|
||||
checkIterator(iter.reset(), expected, LENGTHOF(expected));
|
||||
}
|
||||
|
||||
void ByteTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
|
||||
static const StringAndValue data[]={
|
||||
{ "abcdef", 10 },
|
||||
{ "abcdepq", 200 },
|
||||
{ "abcdeyz", 3000 }
|
||||
};
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildTrie(data, LENGTHOF(data), builder, UDICTTRIE_BUILD_FAST);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
ByteTrie trie(sp.data());
|
||||
// Go into a linear-match node.
|
||||
trie.next('a');
|
||||
trie.next('b');
|
||||
trie.next('c');
|
||||
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchLong()");
|
||||
// Truncate after the linear-match node.
|
||||
ByteTrieIterator iter(trie, 3, errorCode);
|
||||
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
|
||||
return;
|
||||
}
|
||||
static const StringAndValue expected[]={
|
||||
{ "def", 10 },
|
||||
{ "dep", -1 },
|
||||
{ "dey", -1 }
|
||||
};
|
||||
checkIterator(iter, expected, LENGTHOF(expected));
|
||||
// Reset, and we should get the same result.
|
||||
logln("after iter.reset()");
|
||||
checkIterator(iter.reset(), expected, LENGTHOF(expected));
|
||||
}
|
||||
|
||||
// Runs the full set of checks on the data twice: once with a trie built
// with UDICTTRIE_BUILD_FAST and once with UDICTTRIE_BUILD_SMALL.
void ByteTrieTest::checkData(const StringAndValue data[], int32_t dataLength) {
    const int count=(int)dataLength;

    logln("checkData(dataLength=%d, fast)", count);
    checkData(data, dataLength, UDICTTRIE_BUILD_FAST);

    logln("checkData(dataLength=%d, small)", count);
    checkData(data, dataLength, UDICTTRIE_BUILD_SMALL);
}
|
||||
|
||||
void ByteTrieTest::checkData(const StringAndValue data[], int32_t dataLength, UDictTrieBuildOption buildOption) {
|
||||
ByteTrieBuilder builder;
|
||||
StringPiece sp=buildTrie(data, dataLength, builder, buildOption);
|
||||
if(sp.empty()) {
|
||||
return; // buildTrie() reported an error
|
||||
}
|
||||
checkFirst(sp, data, dataLength);
|
||||
checkNext(sp, data, dataLength);
|
||||
checkNextWithState(sp, data, dataLength);
|
||||
checkNextString(sp, data, dataLength);
|
||||
checkIterator(sp, data, dataLength);
|
||||
}
|
||||
|
||||
// Clears the builder, adds all data items to it, builds the trie with the
// given option and returns the result of builder.build().
// After a successful build, also verifies that a further add() sets
// U_NO_WRITE_PERMISSION.
StringPiece ByteTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
                                    ByteTrieBuilder &builder, UDictTrieBuildOption buildOption) {
    IcuTestErrorCode errorCode(*this, "buildTrie()");

    // Add the items to the trie builder in an interesting (not trivial, not random) order.
    // "pos" is the first item to add and "stride" the distance to the next one
    // (modulo dataLength).
    int32_t pos;
    int32_t stride;
    const UBool isOdd=(dataLength&1)!=0;
    if(isOdd) {
        // Odd number of items.
        pos=dataLength/2;
        stride=2;
    } else if((dataLength%3)!=0) {
        // Not a multiple of 3.
        pos=dataLength/5;
        stride=3;
    } else {
        // Even multiple of 3: go backwards from the last item.
        pos=dataLength-1;
        stride=-1;
    }

    builder.clear();
    for(int32_t added=0; added<dataLength; ++added) {
        const StringAndValue &item=data[pos];
        builder.add(item.s, item.value, errorCode);
        pos=(pos+stride)%dataLength;
    }

    StringPiece result(builder.build(buildOption, errorCode));
    const UBool buildFailed=errorCode.logIfFailureAndReset("add()/build()");
    if(!buildFailed) {
        // Adding to the builder after build() is expected to fail.
        builder.add("zzz", 999, errorCode);
        if(errorCode.reset()!=U_NO_WRITE_PERMISSION) {
            errln("builder.build().add(zzz) did not set U_NO_WRITE_PERMISSION");
        }
    }
    logln("serialized trie size: %ld bytes\n", (long)result.length());
    return result;
}
|
||||
|
||||
void ByteTrieTest::checkFirst(const StringPiece &trieBytes,
|
||||
const StringAndValue data[], int32_t dataLength) {
|
||||
ByteTrie trie(trieBytes.data());
|
||||
for(int32_t i=0; i<dataLength; ++i) {
|
||||
int c=(uint8_t)*data[i].s;
|
||||
if(c==0) {
|
||||
continue; // skip empty string
|
||||
}
|
||||
UDictTrieResult firstResult=trie.first(c);
|
||||
int32_t firstValue=UDICTTRIE_RESULT_HAS_VALUE(firstResult) ? trie.getValue() : -1;
|
||||
UDictTrieResult nextResult=trie.next((uint8_t)data[i].s[1]);
|
||||
if(firstResult!=trie.reset().next(c) ||
|
||||
firstResult!=trie.current() ||
|
||||
firstValue!=(UDICTTRIE_RESULT_HAS_VALUE(firstResult) ? trie.getValue() : -1) ||
|
||||
nextResult!=trie.next((uint8_t)data[i].s[1])
|
||||
) {
|
||||
errln("trie.first(%c)!=trie.reset().next(same) for %s",
|
||||
c, data[i].s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::checkNext(const StringPiece &trieBytes,
|
||||
const StringAndValue data[], int32_t dataLength) {
|
||||
ByteTrie trie(trieBytes.data());
|
||||
ByteTrie::State state;
|
||||
for(int32_t i=0; i<dataLength; ++i) {
|
||||
int32_t stringLength= (i&1) ? -1 : strlen(data[i].s);
|
||||
UDictTrieResult result;
|
||||
if( !UDICTTRIE_RESULT_HAS_VALUE(result=trie.next(data[i].s, stringLength)) ||
|
||||
result!=trie.current()
|
||||
) {
|
||||
errln("trie does not seem to contain %s", data[i].s);
|
||||
} else if(trie.getValue()!=data[i].value) {
|
||||
errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
|
||||
data[i].s,
|
||||
(long)trie.getValue(), (long)trie.getValue(),
|
||||
(long)data[i].value, (long)data[i].value);
|
||||
} else if(result!=trie.current() || trie.getValue()!=data[i].value) {
|
||||
errln("trie value for %s changes when repeating current()/getValue()", data[i].s);
|
||||
}
|
||||
trie.reset();
|
||||
stringLength=strlen(data[i].s);
|
||||
result=trie.current();
|
||||
for(int32_t j=0; j<stringLength; ++j) {
|
||||
if(!UDICTTRIE_RESULT_HAS_NEXT(result)) {
|
||||
errln("trie.current()!=hasNext before end of %s (at index %d)", data[i].s, j);
|
||||
break;
|
||||
}
|
||||
if(result==UDICTTRIE_HAS_VALUE) {
|
||||
trie.getValue();
|
||||
if(trie.current()!=UDICTTRIE_HAS_VALUE) {
|
||||
errln("trie.getValue().current()!=UDICTTRIE_HAS_VALUE before end of %s (at index %d)", data[i].s, j);
|
||||
break;
|
||||
}
|
||||
}
|
||||
result=trie.next(data[i].s[j]);
|
||||
if(!UDICTTRIE_RESULT_MATCHES(result)) {
|
||||
errln("trie.next()=UDICTTRIE_NO_MATCH before end of %s (at index %d)", data[i].s, j);
|
||||
break;
|
||||
}
|
||||
if(result!=trie.current()) {
|
||||
errln("trie.next()!=following current() before end of %s (at index %d)", data[i].s, j);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!UDICTTRIE_RESULT_HAS_VALUE(result)) {
|
||||
errln("trie.next()!=hasValue at the end of %s", data[i].s);
|
||||
continue;
|
||||
}
|
||||
trie.getValue();
|
||||
if(result!=trie.current()) {
|
||||
errln("trie.current() != current()+getValue()+current() after end of %s",
|
||||
data[i].s);
|
||||
}
|
||||
// Compare the final current() with whether next() can actually continue.
|
||||
trie.saveState(state);
|
||||
UBool nextContinues=FALSE;
|
||||
for(int32_t c=0x20; c<0x7f; ++c) {
|
||||
if(trie.resetToState(state).next(c)) {
|
||||
nextContinues=TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if((result==UDICTTRIE_HAS_VALUE)!=nextContinues) {
|
||||
errln("(trie.current()==UDICTTRIE_HAS_VALUE) contradicts "
|
||||
"(trie.next(some UChar)!=UDICTTRIE_NO_MATCH) after end of %s", data[i].s);
|
||||
}
|
||||
trie.reset();
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::checkNextWithState(const StringPiece &trieBytes,
|
||||
const StringAndValue data[], int32_t dataLength) {
|
||||
ByteTrie trie(trieBytes.data());
|
||||
ByteTrie::State noState, state;
|
||||
for(int32_t i=0; i<dataLength; ++i) {
|
||||
if((i&1)==0) {
|
||||
// This should have no effect.
|
||||
trie.resetToState(noState);
|
||||
}
|
||||
const char *expectedString=data[i].s;
|
||||
int32_t stringLength=strlen(expectedString);
|
||||
int32_t partialLength=stringLength/3;
|
||||
for(int32_t j=0; j<partialLength; ++j) {
|
||||
if(!UDICTTRIE_RESULT_MATCHES(trie.next(expectedString[j]))) {
|
||||
errln("trie.next()=UDICTTRIE_NO_MATCH for a prefix of %s", data[i].s);
|
||||
return;
|
||||
}
|
||||
}
|
||||
trie.saveState(state);
|
||||
UDictTrieResult resultAtState=trie.current();
|
||||
UDictTrieResult result;
|
||||
int32_t valueAtState=-99;
|
||||
if(UDICTTRIE_RESULT_HAS_VALUE(resultAtState)) {
|
||||
valueAtState=trie.getValue();
|
||||
}
|
||||
result=trie.next(0); // mismatch
|
||||
if(result!=UDICTTRIE_NO_MATCH || result!=trie.current()) {
|
||||
errln("trie.next(0) matched after part of %s", data[i].s);
|
||||
}
|
||||
if( resultAtState!=trie.resetToState(state).current() ||
|
||||
(UDICTTRIE_RESULT_HAS_VALUE(resultAtState) && valueAtState!=trie.getValue())
|
||||
) {
|
||||
errln("trie.next(part of %s) changes current()/getValue() after "
|
||||
"saveState/next(0)/resetToState",
|
||||
data[i].s);
|
||||
} else if(!UDICTTRIE_RESULT_HAS_VALUE(
|
||||
result=trie.next(expectedString+partialLength,
|
||||
stringLength-partialLength)) ||
|
||||
result!=trie.current()) {
|
||||
errln("trie.next(rest of %s) does not seem to contain %s after "
|
||||
"saveState/next(0)/resetToState",
|
||||
data[i].s);
|
||||
} else if(!UDICTTRIE_RESULT_HAS_VALUE(
|
||||
result=trie.resetToState(state).
|
||||
next(expectedString+partialLength,
|
||||
stringLength-partialLength)) ||
|
||||
result!=trie.current()) {
|
||||
errln("trie does not seem to contain %s after saveState/next(rest)/resetToState",
|
||||
data[i].s);
|
||||
} else if(trie.getValue()!=data[i].value) {
|
||||
errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
|
||||
data[i].s,
|
||||
(long)trie.getValue(), (long)trie.getValue(),
|
||||
(long)data[i].value, (long)data[i].value);
|
||||
}
|
||||
trie.reset();
|
||||
}
|
||||
}
|
||||
|
||||
// next(string) is also tested in other functions,
|
||||
// but here we try to go partway through the string, and then beyond it.
|
||||
void ByteTrieTest::checkNextString(const StringPiece &trieBytes,
|
||||
const StringAndValue data[], int32_t dataLength) {
|
||||
ByteTrie trie(trieBytes.data());
|
||||
for(int32_t i=0; i<dataLength; ++i) {
|
||||
const char *expectedString=data[i].s;
|
||||
int32_t stringLength=strlen(expectedString);
|
||||
if(!trie.next(expectedString, stringLength/2)) {
|
||||
errln("trie.next(up to middle of string)=UDICTTRIE_NO_MATCH for %s", data[i].s);
|
||||
continue;
|
||||
}
|
||||
// Test that we stop properly at the end of the string.
|
||||
if(trie.next(expectedString+stringLength/2, stringLength+1-stringLength/2)) {
|
||||
errln("trie.next(string+NUL)!=UDICTTRIE_NO_MATCH for %s", data[i].s);
|
||||
}
|
||||
trie.reset();
|
||||
}
|
||||
}
|
||||
|
||||
void ByteTrieTest::checkIterator(const StringPiece &trieBytes,
|
||||
const StringAndValue data[], int32_t dataLength) {
|
||||
IcuTestErrorCode errorCode(*this, "checkIterator()");
|
||||
ByteTrieIterator iter(trieBytes.data(), 0, errorCode);
|
||||
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trieBytes) constructor")) {
|
||||
return;
|
||||
}
|
||||
checkIterator(iter, data, dataLength);
|
||||
}
|
||||
|
||||
// Walks the iterator and compares each (string, value) pair it yields against
// data[] in order; stops at the first structural failure (no next item or an error).
// Afterwards verifies that the iterator reports exhaustion.
void ByteTrieTest::checkIterator(ByteTrieIterator &iter,
                                 const StringAndValue data[], int32_t dataLength) {
    IcuTestErrorCode errorCode(*this, "checkIterator()");
    int32_t item=0;
    while(item<dataLength) {
        const StringAndValue &expected=data[item];
        if(!iter.hasNext()) {
            errln("trie iterator hasNext()=FALSE for item %d: %s", (int)item, expected.s);
            break;
        }
        const UBool advanced=iter.next(errorCode);
        if(errorCode.logIfFailureAndReset("trie iterator next() for item %d: %s", (int)item, expected.s)) {
            break;
        }
        if(!advanced) {
            errln("trie iterator next()=FALSE for item %d: %s", (int)item, expected.s);
            break;
        }
        // Content mismatches are reported but do not stop the walk.
        if(iter.getString()!=StringPiece(expected.s)) {
            errln("trie iterator next().getString()=%s but expected %s for item %d",
                  iter.getString().data(), expected.s, (int)item);
        }
        if(iter.getValue()!=expected.value) {
            errln("trie iterator next().getValue()=%ld=0x%lx but expected %ld=0x%lx for item %d: %s",
                  (long)iter.getValue(), (long)iter.getValue(),
                  (long)expected.value, (long)expected.value,
                  (int)item, expected.s);
        }
        ++item;
    }
    // Both ways of asking "is there more?" must say no at this point.
    if(iter.hasNext()) {
        errln("trie iterator hasNext()=TRUE after all items");
    }
    const UBool advancedPastEnd=iter.next(errorCode);
    errorCode.logIfFailureAndReset("trie iterator next() after all items");
    if(advancedPastEnd) {
        errln("trie iterator next()=TRUE after all items");
    }
}
|
|
@ -223,6 +223,8 @@
|
|||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="bytetrietest.cpp" />
|
||||
<ClCompile Include="uchartrietest.cpp" />
|
||||
<ClCompile Include="itrbbi.cpp" />
|
||||
<ClCompile Include="rbbiapts.cpp" />
|
||||
<ClCompile Include="rbbitst.cpp" />
|
||||
|
@ -529,4 +531,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
|
@ -29,7 +29,9 @@
|
|||
#include "aliastst.h"
|
||||
#include "usettest.h"
|
||||
|
||||
extern IntlTest *createByteTrieTest();
|
||||
static IntlTest *createLocalPointerTest();
|
||||
extern IntlTest *createUCharTrieTest();
|
||||
|
||||
#define CASE(id, test) case id: \
|
||||
name = #test; \
|
||||
|
@ -68,6 +70,22 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
|
|||
callTest(*test, par);
|
||||
}
|
||||
break;
|
||||
case 17:
|
||||
name = "ByteTrieTest";
|
||||
if (exec) {
|
||||
logln("TestSuite ByteTrieTest---"); logln();
|
||||
LocalPointer<IntlTest> test(createByteTrieTest());
|
||||
callTest(*test, par);
|
||||
}
|
||||
break;
|
||||
case 18:
|
||||
name = "UCharTrieTest";
|
||||
if (exec) {
|
||||
logln("TestSuite UCharTrieTest---"); logln();
|
||||
LocalPointer<IntlTest> test(createUCharTrieTest());
|
||||
callTest(*test, par);
|
||||
}
|
||||
break;
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
}
|
||||
|
|
1038
icu4c/source/test/intltest/uchartrietest.cpp
Normal file
1038
icu4c/source/test/intltest/uchartrietest.cpp
Normal file
File diff suppressed because it is too large
Load diff
|
@ -18,7 +18,7 @@ subdir = test/perf
|
|||
## Files to remove for 'make clean'
|
||||
CLEANFILES = *~
|
||||
|
||||
SUBDIRS = collationperf charperf normperf ubrkperf unisetperf usetperf ustrperf utfperf utrie2perf DateFmtPerf
|
||||
SUBDIRS = collationperf charperf dicttrieperf normperf ubrkperf unisetperf usetperf ustrperf utfperf utrie2perf DateFmtPerf
|
||||
|
||||
# Subdirs that support 'xperf'
|
||||
XSUBDIRS = DateFmtPerf
|
||||
|
|
79
icu4c/source/test/perf/dicttrieperf/Makefile.in
Normal file
79
icu4c/source/test/perf/dicttrieperf/Makefile.in
Normal file
|
@ -0,0 +1,79 @@
|
|||
## Makefile.in for ICU - test/perf/dicttrieperf
|
||||
## Copyright (c) 2001-2010, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Build directory information
|
||||
subdir = test/perf/dicttrieperf
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(DEPS)
|
||||
|
||||
## Target information
|
||||
TARGET = dicttrieperf
|
||||
|
||||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/tools/ctestfw
|
||||
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = dicttrieperf.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check check-local
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET)
|
||||
|
||||
install-local:
|
||||
|
||||
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(OBJECTS) $(TARGET)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) -o $@ $^ $(LIBS)
|
||||
$(POST_BUILD_STEP)
|
||||
|
||||
invoke:
|
||||
ICU_DATA=$${ICU_DATA:-$(top_builddir)/data/} TZ=PST8PDT $(INVOKE) $(INVOCATION)
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
ifneq ($(patsubst %install,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
766
icu4c/source/test/perf/dicttrieperf/dicttrieperf.cpp
Normal file
766
icu4c/source/test/perf/dicttrieperf/dicttrieperf.cpp
Normal file
|
@ -0,0 +1,766 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: dicttrieperf.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010dec09
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Performance test program for dictionary-type tries.
|
||||
*
|
||||
* Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
|
||||
* (Linux)
|
||||
* make
|
||||
* export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
|
||||
* ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
|
||||
* or
|
||||
* ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/uperf.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "bytetrie.h"
|
||||
#include "bytetriebuilder.h"
|
||||
#include "charstr.h"
|
||||
#include "package.h"
|
||||
#include "toolutil.h"
|
||||
#include "triedict.h"
|
||||
#include "ucbuf.h" // struct ULine
|
||||
#include "uchartrie.h"
|
||||
#include "uchartriebuilder.h"
|
||||
#include "uoptions.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
// Test object.
|
||||
class DictionaryTriePerfTest : public UPerfTest {
|
||||
public:
|
||||
DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status)
|
||||
: UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) {
|
||||
if(hasFile()) {
|
||||
getLines(status);
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]>=0x41) {
|
||||
++numTextLines;
|
||||
// Remove trailing CR LF.
|
||||
int32_t len=lines[i].len;
|
||||
UChar c;
|
||||
while(len>0 && ((c=lines[i].name[len-1])==0xa || c==0xd)) {
|
||||
--len;
|
||||
}
|
||||
lines[i].len=len;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
|
||||
|
||||
const char *getSourceDir() const { return sourceDir; }
|
||||
|
||||
UBool hasFile() const { return ucharBuf!=NULL; }
|
||||
const ULine *getCachedLines() const { return lines; }
|
||||
int32_t getNumLines() const { return numLines; }
|
||||
int32_t numTextLines; // excluding comment lines
|
||||
};
|
||||
|
||||
// Performance test function object.
|
||||
// Loads icudt46l.dat (or whatever its current versioned filename)
|
||||
// from the -s or --sourcedir path.
|
||||
class PackageLookup : public UPerfFunction {
|
||||
protected:
|
||||
PackageLookup(const DictionaryTriePerfTest &perf) {
|
||||
IcuToolErrorCode errorCode("PackageLookup()");
|
||||
CharString filename(perf.getSourceDir(), errorCode);
|
||||
int32_t filenameLength=filename.length();
|
||||
if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR &&
|
||||
filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) {
|
||||
filename.append(U_FILE_SEP_CHAR, errorCode);
|
||||
}
|
||||
filename.append(U_ICUDATA_NAME, errorCode);
|
||||
filename.append(".dat", errorCode);
|
||||
pkg.readPackage(filename.data());
|
||||
}
|
||||
|
||||
public:
|
||||
virtual ~PackageLookup() {}
|
||||
|
||||
// virtual void call(UErrorCode* pErrorCode) { ... }
|
||||
|
||||
virtual long getOperationsPerIteration() {
|
||||
return pkg.getItemCount();
|
||||
}
|
||||
|
||||
// virtual long getEventsPerIteration();
|
||||
|
||||
protected:
|
||||
Package pkg;
|
||||
};
|
||||
|
||||
// One table-of-contents record: where the item's name starts inside the
// shared name buffer, plus a data offset.
struct TOCEntry {
    int32_t nameOffset, dataOffset;
};

// Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
// Bisects toc[0..count[ (ordered by strcmp of the names) looking for s.
// Returns the index of the matching entry, or -1 if there is none.
static int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
    int32_t low=0;
    int32_t high=count;
    int32_t previousProbe=high;
    int32_t probe=(low+high)/2;
    // The search is over once the midpoint stops moving.
    while(probe!=previousProbe) {
        const int32_t order=strcmp(s, names+toc[probe].nameOffset);
        if(order==0) {
            return probe;  // found s
        }
        if(order<0) {
            high=probe;
        } else {
            low=probe;
        }
        previousProbe=probe;
        probe=(low+high)/2;
    }
    return -1;  // not found
}
|
||||
|
||||
class BinarySearchPackageLookup : public PackageLookup {
|
||||
public:
|
||||
BinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
|
||||
: PackageLookup(perf) {
|
||||
IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
|
||||
int32_t count=pkg.getItemCount();
|
||||
toc=new TOCEntry[count];
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
toc[i].nameOffset=itemNames.length();
|
||||
toc[i].dataOffset=i; // arbitrary value, see toc comment below
|
||||
// The Package class removes the "icudt46l/" prefix.
|
||||
// We restore that here for a fair performance test.
|
||||
const char *name=pkg.getItem(i)->name;
|
||||
itemNames.append("icudt46l/", errorCode);
|
||||
itemNames.append(name, strlen(name)+1, errorCode);
|
||||
}
|
||||
printf("size of item names: %6ld\n", (long)itemNames.length());
|
||||
printf("size of TOC: %6ld\n", (long)(count*8));
|
||||
printf("total index size: %6ld\n", (long)(itemNames.length()+count*8));
|
||||
}
|
||||
virtual ~BinarySearchPackageLookup() {
|
||||
delete[] toc;
|
||||
}
|
||||
|
||||
virtual void call(UErrorCode * /*pErrorCode*/) {
|
||||
int32_t count=pkg.getItemCount();
|
||||
const char *itemNameChars=itemNames.data();
|
||||
const char *name=itemNameChars;
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
if(simpleBinarySearch(name, itemNameChars, toc, count)<0) {
|
||||
fprintf(stderr, "item not found: %s\n", name);
|
||||
}
|
||||
name=strchr(name, 0)+1;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
CharString itemNames;
|
||||
// toc imitates a .dat file's array of UDataOffsetTOCEntry
|
||||
// with nameOffset and dataOffset.
|
||||
// We don't need the dataOffsets, but we want to imitate the real
|
||||
// memory density, to measure equivalent CPU cache usage.
|
||||
TOCEntry *toc;
|
||||
};
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(a,b) (((a)<(b)) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
// Compares two NUL-terminated strings that are already known to agree in their
// first prefixLength bytes. Bytes are compared as unsigned values.
// On return, prefixLength has been advanced to the full length of the shared prefix.
// Returns <0, 0 or >0 like strcmp().
static int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) {
    int32_t shared=prefixLength;
    int32_t difference;
    for(;; ++shared) {
        const int32_t left=(uint8_t)s1[shared];
        const int32_t right=(uint8_t)s2[shared];
        difference=left-right;
        if(difference!=0 || left==0) {
            break;  // first differing byte, or both strings ended
        }
    }
    prefixLength=shared;
    return difference;
}
|
||||
|
||||
static int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
|
||||
if(count==0) {
|
||||
return -1;
|
||||
}
|
||||
int32_t start=0;
|
||||
int32_t limit=count;
|
||||
// Remember the shared prefix between s, start and limit,
|
||||
// and don't compare that shared prefix again.
|
||||
// The shared prefix should get longer as we narrow the [start, limit[ range.
|
||||
int32_t startPrefixLength=0;
|
||||
int32_t limitPrefixLength=0;
|
||||
// Prime the prefix lengths so that we don't keep prefixLength at 0 until
|
||||
// both the start and limit indexes have moved.
|
||||
// At the same time, we find if s is one of the start and (limit-1) names,
|
||||
// and if not, exclude them from the actual binary search.
|
||||
if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) {
|
||||
return 0;
|
||||
}
|
||||
++start;
|
||||
--limit;
|
||||
if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) {
|
||||
return limit;
|
||||
}
|
||||
while(start<limit) {
|
||||
int32_t i=(start+limit)/2;
|
||||
int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
|
||||
int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength);
|
||||
if(cmp<0) {
|
||||
limit=i;
|
||||
limitPrefixLength=prefixLength;
|
||||
} else if(cmp==0) {
|
||||
return i;
|
||||
} else {
|
||||
start=i;
|
||||
startPrefixLength=prefixLength;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
class PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup {
|
||||
public:
|
||||
PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
|
||||
: BinarySearchPackageLookup(perf) {}
|
||||
|
||||
virtual void call(UErrorCode * /*pErrorCode*/) {
|
||||
int32_t count=pkg.getItemCount();
|
||||
const char *itemNameChars=itemNames.data();
|
||||
const char *name=itemNameChars;
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
if(prefixBinarySearch(name, itemNameChars, toc, count)<0) {
|
||||
fprintf(stderr, "item not found: %s\n", name);
|
||||
}
|
||||
name=strchr(name, 0)+1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Looks up the NUL-terminated string s in the serialized ByteTrie.
// Returns the trie value when the traversal ends on a value, otherwise -1.
static int32_t byteTrieLookup(const char *s, const char *nameTrieBytes) {
    ByteTrie trie(nameTrieBytes);
    const UDictTrieResult outcome=trie.next(s, -1);
    return UDICTTRIE_RESULT_HAS_VALUE(outcome) ? trie.getValue() : -1;
}
|
||||
|
||||
class ByteTriePackageLookup : public PackageLookup {
|
||||
public:
|
||||
ByteTriePackageLookup(const DictionaryTriePerfTest &perf)
|
||||
: PackageLookup(perf) {
|
||||
IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
|
||||
int32_t count=pkg.getItemCount();
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
// The Package class removes the "icudt46l/" prefix.
|
||||
// We restore that here for a fair performance test.
|
||||
// We store all full names so that we do not have to reconstruct them
|
||||
// in the call() function.
|
||||
const char *name=pkg.getItem(i)->name;
|
||||
int32_t offset=itemNames.length();
|
||||
itemNames.append("icudt46l/", errorCode);
|
||||
itemNames.append(name, -1, errorCode);
|
||||
// As value, set the data item index.
|
||||
// In a real implementation, we would use that to get the
|
||||
// start and limit offset of the data item.
|
||||
StringPiece fullName(itemNames.toStringPiece());
|
||||
fullName.remove_prefix(offset);
|
||||
builder.add(fullName, i, errorCode);
|
||||
// NUL-terminate the name for call() to find the next one.
|
||||
itemNames.append(0, errorCode);
|
||||
}
|
||||
int32_t length=builder.build(UDICTTRIE_BUILD_SMALL, errorCode).length();
|
||||
printf("size of ByteTrie: %6ld\n", (long)length);
|
||||
// count+1: +1 for the last-item limit offset which we should have always had
|
||||
printf("size of dataOffsets:%6ld\n", (long)((count+1)*4));
|
||||
printf("total index size: %6ld\n", (long)(length+(count+1)*4));
|
||||
}
|
||||
virtual ~ByteTriePackageLookup() {}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
int32_t count=pkg.getItemCount();
|
||||
const char *nameTrieBytes=builder.build(UDICTTRIE_BUILD_SMALL, *pErrorCode).data();
|
||||
const char *name=itemNames.data();
|
||||
for(int32_t i=0; i<count; ++i) {
|
||||
if(byteTrieLookup(name, nameTrieBytes)<0) {
|
||||
fprintf(stderr, "item not found: %s\n", name);
|
||||
}
|
||||
name=strchr(name, 0)+1;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
ByteTrieBuilder builder;
|
||||
CharString itemNames;
|
||||
};
|
||||
|
||||
// Performance test function object.
|
||||
// Each subclass loads a dictionary text file
|
||||
// from the -s or --sourcedir path plus -f or --file-name.
|
||||
// For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
|
||||
class DictLookup : public UPerfFunction {
|
||||
public:
|
||||
DictLookup(const DictionaryTriePerfTest &perfTest) : perf(perfTest) {}
|
||||
|
||||
virtual long getOperationsPerIteration() {
|
||||
return perf.numTextLines;
|
||||
}
|
||||
|
||||
protected:
|
||||
const DictionaryTriePerfTest &perf;
|
||||
};
|
||||
|
||||
class CompactTrieDictLookup : public DictLookup {
|
||||
public:
|
||||
CompactTrieDictLookup(const DictionaryTriePerfTest &perfTest)
|
||||
: DictLookup(perfTest), ctd(NULL) {
|
||||
IcuToolErrorCode errorCode("UCharTrieDictLookup()");
|
||||
// U+0E1C is the median code unit, from
|
||||
// the UCharTrie root node (split-branch node) for thaidict.txt.
|
||||
MutableTrieDictionary builder(0xe1c, errorCode);
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
builder.addWord(lines[i].name, lines[i].len, errorCode);
|
||||
}
|
||||
ctd=new CompactTrieDictionary(builder, errorCode);
|
||||
int32_t length=(int32_t)ctd->dataSize();
|
||||
printf("size of CompactTrieDict: %6ld bytes\n", (long)length);
|
||||
}
|
||||
|
||||
virtual ~CompactTrieDictLookup() {
|
||||
delete ctd;
|
||||
}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
UText text=UTEXT_INITIALIZER;
|
||||
int32_t lengths[20];
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
|
||||
int32_t count;
|
||||
ctd->matches(&text, lines[i].len,
|
||||
lengths, count, LENGTHOF(lengths));
|
||||
if(count==0 || lengths[count-1]!=lines[i].len) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
CompactTrieDictionary *ctd;
|
||||
};
|
||||
|
||||
// Closely imitate CompactTrieDictionary::matches().
|
||||
// Note: CompactTrieDictionary::matches() is part of its trie implementation,
|
||||
// and while it loops over the text, it knows the current state.
|
||||
// By contrast, this implementation uses UCharTrie API functions that have to
|
||||
// check the trie state each time and load/store state in the object.
|
||||
// (Whether it hasNext() and whether it is in the middle of a linear-match node.)
|
||||
static int32_t
|
||||
ucharTrieMatches(UCharTrie &trie,
|
||||
UText *text, int32_t textLimit,
|
||||
int32_t *lengths, int &count, int limit ) {
|
||||
UChar32 c=utext_next32(text);
|
||||
// Notes:
|
||||
// a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
|
||||
// b) It also ignores non-BMP code points by casting to UChar!
|
||||
if(c<0) {
|
||||
return 0;
|
||||
}
|
||||
// Should be firstForCodePoint() but CompactTrieDictionary
|
||||
// handles only code units.
|
||||
UDictTrieResult result=trie.first(c);
|
||||
int32_t numChars=1;
|
||||
count=0;
|
||||
for(;;) {
|
||||
if(UDICTTRIE_RESULT_HAS_VALUE(result)) {
|
||||
if(count<limit) {
|
||||
// lengths[count++]=(int32_t)utext_getNativeIndex(text);
|
||||
lengths[count++]=numChars; // CompactTrieDictionary just counts chars too.
|
||||
}
|
||||
if(result==UDICTTRIE_HAS_FINAL_VALUE) {
|
||||
break;
|
||||
}
|
||||
} else if(result==UDICTTRIE_NO_MATCH) {
|
||||
break;
|
||||
}
|
||||
if(numChars>=textLimit) {
|
||||
// Note: Why do we have both a text limit and a UText that knows its length?
|
||||
break;
|
||||
}
|
||||
UChar32 c=utext_next32(text);
|
||||
// Notes:
|
||||
// a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
|
||||
// b) It also ignores non-BMP code points by casting to UChar!
|
||||
if(c<0) {
|
||||
break;
|
||||
}
|
||||
++numChars;
|
||||
// Should be nextForCodePoint() but CompactTrieDictionary
|
||||
// handles only code units.
|
||||
result=trie.next(c);
|
||||
}
|
||||
#if 0
|
||||
// Note: CompactTrieDictionary::matches() comments say that it leaves the UText
|
||||
// after the longest prefix match and returns the number of characters
|
||||
// that were matched.
|
||||
if(index!=lastMatch) {
|
||||
utext_setNativeIndex(text, lastMatch);
|
||||
}
|
||||
return lastMatch-start;
|
||||
// However, it does not do either of these, so I am not trying to
|
||||
// imitate it (or its docs) 100%.
|
||||
#endif
|
||||
return numChars;
|
||||
}
|
||||
|
||||
class UCharTrieDictLookup : public DictLookup {
|
||||
public:
|
||||
UCharTrieDictLookup(const DictionaryTriePerfTest &perfTest)
|
||||
: DictLookup(perfTest) {
|
||||
IcuToolErrorCode errorCode("UCharTrieDictLookup()");
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
builder.add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode);
|
||||
}
|
||||
UnicodeString trieUChars;
|
||||
int32_t length=builder.build(UDICTTRIE_BUILD_SMALL, trieUChars, errorCode).length();
|
||||
printf("size of UCharTrie: %6ld bytes\n", (long)length*2);
|
||||
}
|
||||
|
||||
virtual ~UCharTrieDictLookup() {}
|
||||
|
||||
protected:
|
||||
UCharTrieBuilder builder;
|
||||
};
|
||||
|
||||
class UCharTrieDictMatches : public UCharTrieDictLookup {
|
||||
public:
|
||||
UCharTrieDictMatches(const DictionaryTriePerfTest &perfTest)
|
||||
: UCharTrieDictLookup(perfTest) {}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
UnicodeString uchars;
|
||||
UCharTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, uchars, *pErrorCode).getBuffer());
|
||||
UText text=UTEXT_INITIALIZER;
|
||||
int32_t lengths[20];
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
|
||||
int32_t count=0;
|
||||
ucharTrieMatches(trie, &text, lines[i].len,
|
||||
lengths, count, LENGTHOF(lengths));
|
||||
if(count==0 || lengths[count-1]!=lines[i].len) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class UCharTrieDictContains : public UCharTrieDictLookup {
|
||||
public:
|
||||
UCharTrieDictContains(const DictionaryTriePerfTest &perfTest)
|
||||
: UCharTrieDictLookup(perfTest) {}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
UnicodeString uchars;
|
||||
UCharTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, uchars, *pErrorCode).getBuffer());
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
if(!UDICTTRIE_RESULT_HAS_VALUE(trie.reset().next(lines[i].name, lines[i].len))) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Maps a code point to a single byte value for the ByteTrie dictionary:
// U+0E00..U+0EFE map to their low byte, '.' (U+002E) maps to 0xff,
// and everything else yields -1.
static inline int32_t thaiCharToByte(UChar32 c) {
    if(c==0x2e) {
        return 0xff;
    }
    if(c<0xe00 || 0xefe<c) {
        return -1;
    }
    return c&0xff;
}
|
||||
|
||||
// Appends to str the byte encoding (per thaiCharToByte()) of s[0..length[.
// Stops at the first code unit that cannot be encoded, prints a message to
// stderr and returns FALSE; returns TRUE if every code unit was encoded.
static UBool thaiWordToBytes(const UChar *s, int32_t length,
                             CharString &str, UErrorCode &errorCode) {
    for(int32_t index=0; index<length; ++index) {
        const UChar unit=s[index];
        const int32_t encoded=thaiCharToByte(unit);
        if(encoded<0) {
            fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", unit);
            return FALSE;
        }
        str.append((char)encoded, errorCode);
    }
    return TRUE;
}
|
||||
|
||||
class ByteTrieDictLookup : public DictLookup {
|
||||
public:
|
||||
ByteTrieDictLookup(const DictionaryTriePerfTest &perfTest)
|
||||
: DictLookup(perfTest), noDict(FALSE) {
|
||||
IcuToolErrorCode errorCode("ByteTrieDictLookup()");
|
||||
CharString str;
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) {
|
||||
fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i);
|
||||
noDict=TRUE;
|
||||
break;
|
||||
}
|
||||
builder.add(str.toStringPiece(), 0, errorCode);
|
||||
}
|
||||
if(!noDict) {
|
||||
int32_t length=builder.build(UDICTTRIE_BUILD_SMALL, errorCode).length();
|
||||
printf("size of ByteTrie: %6ld bytes\n", (long)length);
|
||||
}
|
||||
}
|
||||
|
||||
virtual ~ByteTrieDictLookup() {}
|
||||
|
||||
protected:
|
||||
ByteTrieBuilder builder;
|
||||
UBool noDict;
|
||||
};
|
||||
|
||||
// ByteTrie counterpart of ucharTrieMatches(): each character read from text is
// converted with thaiCharToByte() before being fed to the trie.
// Stores in lengths[] the character count at every position where the trie has
// a value (up to limit entries) and sets count to the number of entries stored.
// Returns the number of characters consumed, or 0 (leaving count untouched)
// if the text is empty.
static int32_t
byteTrieMatches(ByteTrie &trie,
                UText *text, int32_t textLimit,
                int32_t *lengths, int &count, int limit ) {
    UChar32 c=utext_next32(text);
    if(c<0) {
        return 0;
    }
    count=0;
    int32_t numChars=1;
    UDictTrieResult result=trie.first(thaiCharToByte(c));
    for(;;) {
        if(UDICTTRIE_RESULT_HAS_VALUE(result)) {
            if(count<limit) {
                // lengths[count++]=(int32_t)utext_getNativeIndex(text);
                // CompactTrieDictionary just counts chars too.
                lengths[count]=numChars;
                ++count;
            }
            if(result==UDICTTRIE_HAS_FINAL_VALUE) {
                break;
            }
        } else if(result==UDICTTRIE_NO_MATCH) {
            break;
        }
        if(numChars>=textLimit) {
            break;
        }
        c=utext_next32(text);
        if(c<0) {
            break;
        }
        ++numChars;
        result=trie.next(thaiCharToByte(c));
    }
    return numChars;
}
|
||||
|
||||
class ByteTrieDictMatches : public ByteTrieDictLookup {
|
||||
public:
|
||||
ByteTrieDictMatches(const DictionaryTriePerfTest &perfTest)
|
||||
: ByteTrieDictLookup(perfTest) {}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
if(noDict) {
|
||||
return;
|
||||
}
|
||||
ByteTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, *pErrorCode).data());
|
||||
UText text=UTEXT_INITIALIZER;
|
||||
int32_t lengths[20];
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(lines[i].name[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
|
||||
int32_t count=0;
|
||||
byteTrieMatches(trie, &text, lines[i].len,
|
||||
lengths, count, LENGTHOF(lengths));
|
||||
if(count==0 || lengths[count-1]!=lines[i].len) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class ByteTrieDictContains : public ByteTrieDictLookup {
|
||||
public:
|
||||
ByteTrieDictContains(const DictionaryTriePerfTest &perfTest)
|
||||
: ByteTrieDictLookup(perfTest) {}
|
||||
|
||||
virtual void call(UErrorCode *pErrorCode) {
|
||||
if(noDict) {
|
||||
return;
|
||||
}
|
||||
ByteTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, *pErrorCode).data());
|
||||
const ULine *lines=perf.getCachedLines();
|
||||
int32_t numLines=perf.getNumLines();
|
||||
for(int32_t i=0; i<numLines; ++i) {
|
||||
const UChar *line=lines[i].name;
|
||||
// Skip comment lines (start with a character below 'A').
|
||||
if(line[0]<0x41) {
|
||||
continue;
|
||||
}
|
||||
UDictTrieResult result=trie.first(thaiCharToByte(line[0]));
|
||||
int32_t lineLength=lines[i].len;
|
||||
for(int32_t j=1; j<lineLength; ++j) {
|
||||
if(!UDICTTRIE_RESULT_HAS_NEXT(result)) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
break;
|
||||
}
|
||||
result=trie.next(thaiCharToByte(line[j]));
|
||||
}
|
||||
if(!UDICTTRIE_RESULT_HAS_VALUE(result)) {
|
||||
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Maps a test index to its name and, when exec is set, to a newly allocated
// test object. With an input file (hasFile()) the dictionary tests are
// offered; otherwise the package-lookup tests are.
// Returns NULL when exec is false or when the index is out of range
// (in which case name is set to the empty string).
UPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec,
                                                      const char *&name, char * /*par*/) {
    if(!hasFile()) {
        if(exec && index==0) {
            puts("Running ByteTrie perf tests on the .dat package file from the --sourcedir.\n"
                 "For UCharTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
        }
        switch(index) {
        case 0:
            name="simplebinarysearch";
            if(!exec) {
                break;
            }
            return new BinarySearchPackageLookup(*this);
        case 1:
            name="prefixbinarysearch";
            if(!exec) {
                break;
            }
            return new PrefixBinarySearchPackageLookup(*this);
        case 2:
            name="bytetrie";
            if(!exec) {
                break;
            }
            return new ByteTriePackageLookup(*this);
        default:
            name="";
            break;
        }
        return NULL;
    }
    switch(index) {
    case 0:
        name="compacttriematches";
        if(!exec) {
            break;
        }
        return new CompactTrieDictLookup(*this);
    case 1:
        name="uchartriematches";
        if(!exec) {
            break;
        }
        return new UCharTrieDictMatches(*this);
    case 2:
        name="uchartriecontains";
        if(!exec) {
            break;
        }
        return new UCharTrieDictContains(*this);
    case 3:
        name="bytetriematches";
        if(!exec) {
            break;
        }
        return new ByteTrieDictMatches(*this);
    case 4:
        name="bytetriecontains";
        if(!exec) {
            break;
        }
        return new ByteTrieDictContains(*this);
    default:
        name="";
        break;
    }
    return NULL;
}
|
||||
|
||||
int main(int argc, const char *argv[]) {
|
||||
IcuToolErrorCode errorCode("dicttrieperf main()");
|
||||
DictionaryTriePerfTest test(argc, argv, errorCode);
|
||||
if(errorCode.isFailure()) {
|
||||
fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName());
|
||||
test.usage();
|
||||
return errorCode.reset();
|
||||
}
|
||||
if(!test.run()) {
|
||||
fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n");
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2002-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 2002-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -203,6 +203,12 @@ void UPerfTest::init(UOption addOptions[], int32_t addOptionsCount,
|
|||
}
|
||||
|
||||
ULine* UPerfTest::getLines(UErrorCode& status){
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
if (lines != NULL) {
|
||||
return lines; // don't do it again
|
||||
}
|
||||
lines = new ULine[MAXLINES];
|
||||
int maxLines = MAXLINES;
|
||||
numLines=0;
|
||||
|
|
|
@ -52,6 +52,9 @@ LDFLAGS += $(LDFLAGSICUTOOLUTIL)
|
|||
LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)
|
||||
|
||||
OBJECTS = filestrm.o package.o pkgitems.o swapimpl.o toolutil.o unewdata.o \
|
||||
dicttriebuilder.o bytetriebuilder.o bytetrieiterator.o \
|
||||
uchartrie.o uchartriebuilder.o uchartrieiterator.o \
|
||||
denseranges.o \
|
||||
ucm.o ucmstate.o uoptions.o uparse.o \
|
||||
ucbuf.o xmlparser.o writesrc.o \
|
||||
pkg_icu.o pkg_genc.o pkg_gencmn.o flagparser.o filetools.o \
|
||||
|
|
755
icu4c/source/tools/toolutil/bytetriebuilder.cpp
Normal file
755
icu4c/source/tools/toolutil/bytetriebuilder.cpp
Normal file
|
@ -0,0 +1,755 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetriebuilder.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Builder class for ByteTrie dictionary trie.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "bytetrie.h"
|
||||
#include "bytetriebuilder.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "uarrsort.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Note: This builder implementation stores (bytes, value) pairs with full copies
|
||||
* of the byte sequences, until the ByteTrie is built.
|
||||
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
|
||||
*/
|
||||
|
||||
class ByteTrieElement : public UMemory {
|
||||
public:
|
||||
// Use compiler's default constructor, initializes nothing.
|
||||
|
||||
void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
|
||||
|
||||
StringPiece getString(const CharString &strings) const {
|
||||
int32_t offset=stringOffset;
|
||||
int32_t length;
|
||||
if(offset>=0) {
|
||||
length=(uint8_t)strings[offset++];
|
||||
} else {
|
||||
offset=~offset;
|
||||
length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
|
||||
offset+=2;
|
||||
}
|
||||
return StringPiece(strings.data()+offset, length);
|
||||
}
|
||||
int32_t getStringLength(const CharString &strings) const {
|
||||
int32_t offset=stringOffset;
|
||||
if(offset>=0) {
|
||||
return (uint8_t)strings[offset];
|
||||
} else {
|
||||
offset=~offset;
|
||||
return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
|
||||
}
|
||||
}
|
||||
|
||||
char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
|
||||
|
||||
int32_t getValue() const { return value; }
|
||||
|
||||
int32_t compareStringTo(const ByteTrieElement &o, const CharString &strings) const;
|
||||
|
||||
private:
|
||||
const char *data(const CharString &strings) const {
|
||||
int32_t offset=stringOffset;
|
||||
if(offset>=0) {
|
||||
++offset;
|
||||
} else {
|
||||
offset=~offset+2;
|
||||
}
|
||||
return strings.data()+offset;
|
||||
}
|
||||
|
||||
// If the stringOffset is non-negative, then the first strings byte contains
|
||||
// the string length.
|
||||
// If the stringOffset is negative, then the first two strings bytes contain
|
||||
// the string length (big-endian), and the offset needs to be bit-inverted.
|
||||
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
|
||||
int32_t stringOffset;
|
||||
int32_t value;
|
||||
};
|
||||
|
||||
void
|
||||
ByteTrieElement::setTo(const StringPiece &s, int32_t val,
|
||||
CharString &strings, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
int32_t length=s.length();
|
||||
if(length>0xffff) {
|
||||
// Too long: We store the length in 1 or 2 bytes.
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
int32_t offset=strings.length();
|
||||
if(length>0xff) {
|
||||
offset=~offset;
|
||||
strings.append((char)(length>>8), errorCode);
|
||||
}
|
||||
strings.append((char)length, errorCode);
|
||||
stringOffset=offset;
|
||||
value=val;
|
||||
strings.append(s, errorCode);
|
||||
}
|
||||
|
||||
// Compares the two elements' strings bytewise (memcmp order);
// if one is a prefix of the other, the shorter one sorts first.
// Returns <0, 0 or >0.
int32_t
ByteTrieElement::compareStringTo(const ByteTrieElement &other, const CharString &strings) const {
    // TODO: add StringPiece::compare(), see ticket #8187
    const StringPiece left=getString(strings);
    const StringPiece right=other.getString(strings);
    const int32_t lengthDiff=left.length()-right.length();
    const int32_t commonLength= lengthDiff<=0 ? left.length() : right.length();
    const int32_t diff=uprv_memcmp(left.data(), right.data(), commonLength);
    if(diff!=0) {
        return diff;
    }
    return lengthDiff;
}
|
||||
|
||||
// Releases the serialized trie bytes and the array of collected elements.
ByteTrieBuilder::~ByteTrieBuilder() {
    uprv_free(bytes);
    delete[] elements;
}
|
||||
|
||||
// Adds a (byte sequence, value) pair to be included in the trie.
//
// s          the byte sequence; it is copied into the builder's own storage
// value      the value associated with s
// errorCode  in/out ICU error code; the call is a no-op if it already
//            indicates a failure. Set to U_NO_WRITE_PERMISSION if the trie
//            has already been built, and to U_MEMORY_ALLOCATION_ERROR if the
//            elements array cannot be grown.
// Returns *this, so that calls can be chained.
ByteTrieBuilder &
ByteTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(bytesLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    bytesCapacity+=s.length()+1;  // Crude bytes preallocation estimate.
    if(elementsLength==elementsCapacity) {
        int32_t newCapacity;
        if(elementsCapacity==0) {
            newCapacity=1024;
        } else {
            newCapacity=4*elementsCapacity;
        }
        ByteTrieElement *newElements=new ByteTrieElement[newCapacity];
        if(newElements==NULL) {
            // Stop here: continuing would copy into and then dereference
            // a NULL array. The existing elements are left untouched.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        if(elementsLength>0) {
            uprv_memcpy(newElements, elements, elementsLength*sizeof(ByteTrieElement));
        }
        delete[] elements;
        elements=newElements;
        elementsCapacity=newCapacity;
    }
    elements[elementsLength++].setTo(s, value, strings, errorCode);
    return *this;
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
// Comparison callback for uprv_sortArray(): orders two ByteTrieElement
// objects by their strings. The context is the CharString that holds
// all of the elements' strings.
static int32_t U_CALLCONV
compareElementStrings(const void *context, const void *left, const void *right) {
    const CharString &strings=*static_cast<const CharString *>(context);
    const ByteTrieElement &leftElement=*static_cast<const ByteTrieElement *>(left);
    const ByteTrieElement &rightElement=*static_cast<const ByteTrieElement *>(right);
    return leftElement.compareStringTo(rightElement, strings);
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
// Builds the byte-serialized trie from the added elements and returns it.
// A trie that was already built is returned again without rebuilding.
// Errors: U_INDEX_OUTOFBOUNDS_ERROR if nothing was added,
// U_ILLEGAL_ARGUMENT_ERROR if the same string was added twice,
// U_MEMORY_ALLOCATION_ERROR if the byte buffer cannot be allocated.
// On these errors an empty StringPiece is returned.
StringPiece
ByteTrieBuilder::build(UDictTrieBuildOption buildOption, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return StringPiece();
    }
    if(bytesLength>0) {
        // Already built. The trie occupies the end of the bytes buffer.
        return StringPiece(bytes+(bytesCapacity-bytesLength), bytesLength);
    }
    if(elementsLength==0) {
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return StringPiece();
    }
    uprv_sortArray(elements, elementsLength, (int32_t)sizeof(ByteTrieElement),
                   compareElementStrings, &strings,
                   FALSE,  // need not be a stable sort
                   &errorCode);
    if(U_FAILURE(errorCode)) {
        return StringPiece();
    }
    // Duplicate strings are not allowed.
    // After sorting, duplicates are adjacent.
    for(int32_t i=1; i<elementsLength; ++i) {
        if(elements[i-1].getString(strings)==elements[i].getString(strings)) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return StringPiece();
        }
    }
    // Create and byte-serialize the trie for the elements.
    if(bytesCapacity<1024) {
        bytesCapacity=1024;
    }
    bytes=static_cast<char *>(uprv_malloc(bytesCapacity));
    if(bytes==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return StringPiece();
    }
    if(buildOption!=UDICTTRIE_BUILD_FAST) {
        // UDICTTRIE_BUILD_SMALL: build a node graph first, then write it.
        createCompactBuilder(2*elementsLength, errorCode);
        Node *root=makeNode(0, elementsLength, 0, errorCode);
        if(U_SUCCESS(errorCode)) {
            root->markRightEdgesFirst(-1);
            root->write(*this);
        }
        deleteCompactBuilder();
    } else {
        writeNode(0, elementsLength, 0);
    }
    if(bytes==NULL) {
        // ensureCapacity() releases the buffer when it fails to grow it.
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return StringPiece();
    }
    return StringPiece(bytes+(bytesCapacity-bytesLength), bytesLength);
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length byteIndex.
|
||||
void
|
||||
ByteTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t byteIndex) {
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
if(byteIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
writeValueAndFinal(value, TRUE); // final-value node
|
||||
return;
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
// Now all [start..limit[ strings are longer than byteIndex.
|
||||
const ByteTrieElement &minElement=elements[start];
|
||||
const ByteTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
|
||||
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
|
||||
if(minByte==maxByte) {
|
||||
// Linear-match node: All strings have the same character at byteIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastByteIndex=byteIndex;
|
||||
while(++lastByteIndex<minStringLength &&
|
||||
minElement.charAt(lastByteIndex, strings)==
|
||||
maxElement.charAt(lastByteIndex, strings)) {}
|
||||
writeNode(start, limit, lastByteIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const char *s=minElement.getString(strings).data();
|
||||
int32_t length=lastByteIndex-byteIndex;
|
||||
while(length>ByteTrie::kMaxLinearMatchLength) {
|
||||
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
|
||||
length-=ByteTrie::kMaxLinearMatchLength;
|
||||
write(s+lastByteIndex, ByteTrie::kMaxLinearMatchLength);
|
||||
write(ByteTrie::kMinLinearMatch+ByteTrie::kMaxLinearMatchLength-1);
|
||||
}
|
||||
write(s+byteIndex, length);
|
||||
write(ByteTrie::kMinLinearMatch+length-1);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different bytes at byteIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minByte!=maxByte.
|
||||
writeBranchSubNode(start, limit, byteIndex, length);
|
||||
write(--length);
|
||||
if(length>=ByteTrie::kMinLinearMatch) {
|
||||
write(0);
|
||||
}
|
||||
}
|
||||
if(hasValue) {
|
||||
writeValueAndFinal(value, FALSE);
|
||||
}
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than byteIndex &&
|
||||
// length different bytes at byteIndex
|
||||
void
|
||||
ByteTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length) {
|
||||
char middleBytes[16];
|
||||
int32_t lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle byte.
|
||||
// First, find the middle byte.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
char byte;
|
||||
do {
|
||||
byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Encode the less-than branch first.
|
||||
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
|
||||
writeBranchSubNode(start, i, byteIndex, length/2);
|
||||
lessThan[ltLength]=bytesLength;
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
// For each byte, find its elements array start and whether it has a final value.
|
||||
int32_t starts[ByteTrie::kMaxBranchLinearSubNodeLength];
|
||||
UBool final[ByteTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
int32_t byteNumber=0;
|
||||
do {
|
||||
int32_t i=starts[byteNumber]=start;
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
final[byteNumber]= start==i-1 && byteIndex+1==elements[start].getStringLength(strings);
|
||||
start=i;
|
||||
} while(++byteNumber<length-1);
|
||||
// byteNumber==length-1, and the maxByte elements range is [start..limit[
|
||||
starts[byteNumber]=start;
|
||||
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minByte sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minByte sub-node last, for a shorter delta.
|
||||
int32_t jumpTargets[ByteTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
do {
|
||||
--byteNumber;
|
||||
if(!final[byteNumber]) {
|
||||
writeNode(starts[byteNumber], starts[byteNumber+1], byteIndex+1);
|
||||
jumpTargets[byteNumber]=bytesLength;
|
||||
}
|
||||
} while(byteNumber>0);
|
||||
// The maxByte sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
byteNumber=length-1;
|
||||
writeNode(start, limit, byteIndex+1);
|
||||
write((uint8_t)elements[start].charAt(byteIndex, strings));
|
||||
// Write the rest of this node's byte-value pairs.
|
||||
while(--byteNumber>=0) {
|
||||
start=starts[byteNumber];
|
||||
int32_t value;
|
||||
if(final[byteNumber]) {
|
||||
// Write the final value for the one string ending with this byte.
|
||||
value=elements[start].getValue();
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
value=bytesLength-jumpTargets[byteNumber];
|
||||
}
|
||||
writeValueAndFinal(value, final[byteNumber]);
|
||||
write((uint8_t)elements[start].charAt(byteIndex, strings));
|
||||
}
|
||||
// Write the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
writeDelta(bytesLength-lessThan[ltLength]); // less-than
|
||||
write((uint8_t)middleBytes[ltLength]);
|
||||
}
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length byteIndex.
|
||||
DictTrieBuilder::Node *
|
||||
ByteTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
if(byteIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
return registerFinalValue(value, errorCode);
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
Node *node;
|
||||
// Now all [start..limit[ strings are longer than byteIndex.
|
||||
const ByteTrieElement &minElement=elements[start];
|
||||
const ByteTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
|
||||
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
|
||||
if(minByte==maxByte) {
|
||||
// Linear-match node: All strings have the same character at byteIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastByteIndex=byteIndex;
|
||||
while(++lastByteIndex<minStringLength &&
|
||||
minElement.charAt(lastByteIndex, strings)==
|
||||
maxElement.charAt(lastByteIndex, strings)) {}
|
||||
Node *nextNode=makeNode(start, limit, lastByteIndex, errorCode);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const char *s=minElement.getString(strings).data();
|
||||
int32_t length=lastByteIndex-byteIndex;
|
||||
while(length>ByteTrie::kMaxLinearMatchLength) {
|
||||
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
|
||||
length-=ByteTrie::kMaxLinearMatchLength;
|
||||
node=new BTLinearMatchNode(
|
||||
s+lastByteIndex,
|
||||
ByteTrie::kMaxLinearMatchLength,
|
||||
nextNode);
|
||||
node=registerNode(node, errorCode);
|
||||
nextNode=node;
|
||||
}
|
||||
node=new BTLinearMatchNode(s+byteIndex, length, nextNode);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different bytes at byteIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minByte!=maxByte.
|
||||
Node *subNode=makeBranchSubNode(start, limit, byteIndex, length, errorCode);
|
||||
node=new BTBranchHeadNode(length, subNode);
|
||||
}
|
||||
node=registerNode(node, errorCode);
|
||||
if(hasValue) {
|
||||
node=registerNode(new BTValueNode(value, node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than byteIndex &&
|
||||
// length different bytes at byteIndex
|
||||
DictTrieBuilder::Node *
|
||||
ByteTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
|
||||
int32_t length, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
char middleBytes[16];
|
||||
Node *lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle byte.
|
||||
// First, find the middle byte.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
char byte;
|
||||
do {
|
||||
byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Encode the less-than branch first.
|
||||
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
|
||||
lessThan[ltLength]=makeBranchSubNode(start, i, byteIndex, length/2, errorCode);
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
BTListBranchNode *listNode=new BTListBranchNode();
|
||||
if(listNode==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
// For each byte, find its elements array start and whether it has a final value.
|
||||
int32_t byteNumber=0;
|
||||
do {
|
||||
int32_t i=start;
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
if(start==i-1 && byteIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add((uint8_t)byte, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add((uint8_t)byte, makeNode(start, i, byteIndex+1, errorCode));
|
||||
}
|
||||
start=i;
|
||||
} while(++byteNumber<length-1);
|
||||
// byteNumber==length-1, and the maxByte elements range is [start..limit[
|
||||
char byte=elements[start].charAt(byteIndex, strings);
|
||||
if(start==limit-1 && byteIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add((uint8_t)byte, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add((uint8_t)byte, makeNode(start, limit, byteIndex+1, errorCode));
|
||||
}
|
||||
Node *node=registerNode(listNode, errorCode);
|
||||
// Create the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
node=registerNode(
|
||||
new BTSplitBranchNode(middleBytes[ltLength], lessThan[ltLength], node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTFinalValueNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
offset=b.writeValueAndFinal(value, TRUE);
|
||||
}
|
||||
|
||||
// Two value nodes are equal if they are the same object, or if the
// ValueNode parts compare equal and both point to the same next node.
UBool
ByteTrieBuilder::BTValueNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(ValueNode::operator==(other)) {
        const BTValueNode &that=static_cast<const BTValueNode &>(other);
        return next==that.next;
    }
    return FALSE;
}
|
||||
|
||||
// If this node has not been visited yet (offset==0), forwards the call to
// the next node and adopts the edge number it returns as this node's offset.
// Returns the resulting edge number (unchanged if already visited).
int32_t
ByteTrieBuilder::BTValueNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    edgeNumber=next->markRightEdgesFirst(edgeNumber);
    offset=edgeNumber;
    return edgeNumber;
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTValueNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
offset=b.writeValueAndFinal(value, FALSE);
|
||||
}
|
||||
|
||||
// Creates a linear-match node for len bytes followed by nextNode.
// The bytes are not copied; only the pointer is kept.
// The bytes are mixed into the hash code set up by the base class.
ByteTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode), s(bytes) {
    hash=37*hash+uhash_hashCharsN(bytes, len);
}
|
||||
|
||||
// Two linear-match nodes are equal if they are the same object, or if the
// LinearMatchNode parts compare equal and the matched bytes are identical.
UBool
ByteTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(LinearMatchNode::operator==(other)) {
        const BTLinearMatchNode &that=static_cast<const BTLinearMatchNode &>(other);
        return uprv_memcmp(s, that.s, length)==0;
    }
    return FALSE;
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTLinearMatchNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
b.write(s, length);
|
||||
offset=b.write(ByteTrie::kMinLinearMatch+length-1);
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTListBranchNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minByte sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minByte sub-node last, for a shorter delta.
|
||||
int32_t byteNumber=length-1;
|
||||
Node *rightEdge=equal[byteNumber];
|
||||
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
|
||||
do {
|
||||
--byteNumber;
|
||||
if(equal[byteNumber]!=NULL) {
|
||||
equal[byteNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
|
||||
}
|
||||
} while(byteNumber>0);
|
||||
// The maxByte sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
byteNumber=length-1;
|
||||
if(rightEdge==NULL) {
|
||||
b.writeValueAndFinal(values[byteNumber], TRUE);
|
||||
} else {
|
||||
rightEdge->write(builder);
|
||||
}
|
||||
b.write(units[byteNumber]);
|
||||
// Write the rest of this node's byte-value pairs.
|
||||
while(--byteNumber>=0) {
|
||||
int32_t value;
|
||||
UBool isFinal;
|
||||
if(equal[byteNumber]==NULL) {
|
||||
// Write the final value for the one string ending with this byte.
|
||||
value=values[byteNumber];
|
||||
isFinal=TRUE;
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
U_ASSERT(equal[byteNumber]->getOffset()>0);
|
||||
value=b.bytesLength-equal[byteNumber]->getOffset();
|
||||
isFinal=FALSE;
|
||||
}
|
||||
b.writeValueAndFinal(value, isFinal);
|
||||
offset=b.write(units[byteNumber]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTSplitBranchNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
// Encode the less-than branch first.
|
||||
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
|
||||
// Encode the greater-or-equal branch last because we do not jump for it at all.
|
||||
greaterOrEqual->write(builder);
|
||||
// Write this node.
|
||||
U_ASSERT(lessThan->getOffset()>0);
|
||||
b.writeDelta(b.bytesLength-lessThan->getOffset()); // less-than
|
||||
offset=b.write(unit);
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTBranchHeadNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
offset=b.write((length-1));
|
||||
if(length>ByteTrie::kMinLinearMatch) {
|
||||
offset=b.write(0);
|
||||
}
|
||||
}
|
||||
|
||||
// Makes sure that the bytes buffer can hold `length` bytes, growing it by
// doubling if necessary. The written bytes sit at the end of the buffer and
// are moved to the end of the new buffer.
// Returns FALSE if there is no buffer (an earlier allocation failed) or if
// growing fails; in the latter case the old buffer is freed and bytes is
// set to NULL.
UBool
ByteTrieBuilder::ensureCapacity(int32_t length) {
    if(bytes==NULL) {
        return FALSE;  // previous memory allocation had failed
    }
    if(length<=bytesCapacity) {
        return TRUE;  // enough room already
    }
    int32_t grownCapacity=bytesCapacity;
    while(grownCapacity<=length) {
        grownCapacity*=2;
    }
    char *grownBytes=static_cast<char *>(uprv_malloc(grownCapacity));
    if(grownBytes==NULL) {
        // unable to allocate memory
        uprv_free(bytes);
        bytes=NULL;
        return FALSE;
    }
    const char *usedStart=bytes+(bytesCapacity-bytesLength);
    uprv_memcpy(grownBytes+(grownCapacity-bytesLength), usedStart, bytesLength);
    uprv_free(bytes);
    bytes=grownBytes;
    bytesCapacity=grownCapacity;
    return TRUE;
}
|
||||
|
||||
// Prepends one byte to the bytes written so far (the buffer is filled from
// its end toward its start). Nothing is written if the buffer cannot be grown.
// Returns the resulting number of bytes written.
int32_t
ByteTrieBuilder::write(int32_t byte) {
    if(!ensureCapacity(bytesLength+1)) {
        return bytesLength;
    }
    ++bytesLength;
    bytes[bytesCapacity-bytesLength]=(char)byte;
    return bytesLength;
}
|
||||
|
||||
// Prepends `length` bytes from b, in their given order, to the bytes written
// so far. Nothing is written if the buffer cannot be grown.
// Returns the resulting number of bytes written.
int32_t
ByteTrieBuilder::write(const char *b, int32_t length) {
    const int32_t grownLength=bytesLength+length;
    if(!ensureCapacity(grownLength)) {
        return bytesLength;
    }
    bytesLength=grownLength;
    uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
    return bytesLength;
}
|
||||
|
||||
// Encodes the value i in 1 to 5 bytes (big-endian after the lead byte),
// shifts the lead byte left by one and puts the `final` flag into its
// lowest bit, then writes the bytes.
// Negative values and values above 0xffffff always take 5 bytes.
// Returns the resulting number of bytes written to the builder.
int32_t
ByteTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
    char intBytes[5];
    int32_t length;
    if(i<0 || i>0xffffff) {
        intBytes[0]=(char)ByteTrie::kFiveByteValueLead;
        intBytes[1]=(char)(i>>24);
        intBytes[2]=(char)(i>>16);
        intBytes[3]=(char)(i>>8);
        intBytes[4]=(char)i;
        length=5;
    } else if(i<=ByteTrie::kMaxOneByteValue) {
        intBytes[0]=(char)(ByteTrie::kMinOneByteValueLead+i);
        length=1;
    } else if(i<=ByteTrie::kMaxTwoByteValue) {
        intBytes[0]=(char)(ByteTrie::kMinTwoByteValueLead+(i>>8));
        intBytes[1]=(char)i;
        length=2;
    } else if(i<=ByteTrie::kMaxThreeByteValue) {
        intBytes[0]=(char)(ByteTrie::kMinThreeByteValueLead+(i>>16));
        intBytes[1]=(char)(i>>8);
        intBytes[2]=(char)i;
        length=3;
    } else {
        intBytes[0]=(char)ByteTrie::kFourByteValueLead;
        intBytes[1]=(char)(i>>16);
        intBytes[2]=(char)(i>>8);
        intBytes[3]=(char)i;
        length=4;
    }
    intBytes[0]=(char)((intBytes[0]<<1)|final);
    return write(intBytes, length);
}
|
||||
|
||||
// Encodes the non-negative jump delta i in 1 to 5 bytes and writes them:
// an optional lead byte (which also carries the top bits for 2- and 3-byte
// forms), followed by the remaining bytes of i in big-endian order.
//
// i  the delta; must not be negative (asserted)
// Returns the resulting number of bytes written to the builder.
int32_t
ByteTrieBuilder::writeDelta(int32_t i) {
    char intBytes[5];
    int32_t length;
    U_ASSERT(i>=0);
    if(i<=ByteTrie::kMaxOneByteDelta) {
        length=0;
    } else if(i<=ByteTrie::kMaxTwoByteDelta) {
        intBytes[0]=(char)(ByteTrie::kMinTwoByteDeltaLead+(i>>8));
        length=1;
    } else {
        if(i<=ByteTrie::kMaxThreeByteDelta) {
            intBytes[0]=(char)(ByteTrie::kMinThreeByteDeltaLead+(i>>16));
            length=1;
        } else {
            if(i<=0xffffff) {
                intBytes[0]=(char)ByteTrie::kFourByteDeltaLead;
                length=1;
            } else {
                intBytes[0]=(char)ByteTrie::kFiveByteDeltaLead;
                intBytes[1]=(char)(i>>24);
                length=2;
            }
            // Append at the running position: storing every middle byte at
            // index 1 would overwrite the previous one and leave later
            // array slots uninitialized.
            intBytes[length++]=(char)(i>>16);
        }
        intBytes[length++]=(char)(i>>8);
    }
    intBytes[length++]=(char)i;
    return write(intBytes, length);
}
|
||||
|
||||
U_NAMESPACE_END
|
123
icu4c/source/tools/toolutil/bytetriebuilder.h
Normal file
123
icu4c/source/tools/toolutil/bytetriebuilder.h
Normal file
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetriebuilder.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Builder class for ByteTrie dictionary trie.
|
||||
*/
|
||||
|
||||
#ifndef __BYTETRIEBUILDER_H__
|
||||
#define __BYTETRIEBUILDER_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "charstr.h"
|
||||
#include "dicttriebuilder.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class ByteTrieElement;
|
||||
|
||||
// Builder class for the ByteTrie dictionary trie.
// Extends DictTrieBuilder with byte-specific node classes and a byte output buffer.
class U_TOOLUTIL_API ByteTrieBuilder : public DictTrieBuilder {
public:
    // Starts with no element array and no output buffer.
    ByteTrieBuilder()
            : elements(NULL),
              elementsCapacity(0),
              elementsLength(0),
              bytes(NULL),
              bytesCapacity(0),
              bytesLength(0) {}
    ~ByteTrieBuilder();

    ByteTrieBuilder &add(const StringPiece &s, int32_t value, UErrorCode &errorCode);

    StringPiece build(UDictTrieBuildOption buildOption, UErrorCode &errorCode);

    // Empties the builder by clearing the string storage and zeroing both lengths.
    // The capacities and the buffers themselves are not touched here.
    ByteTrieBuilder &clear() {
        bytesLength=0;
        elementsLength=0;
        strings.clear();
        return *this;
    }

private:
    void writeNode(int32_t start, int32_t limit, int32_t byteIndex);
    void writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length);

    Node *makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode);
    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
                            int32_t length, UErrorCode &errorCode);

    // Low-level output helpers; each write function returns the new bytesLength.
    UBool ensureCapacity(int32_t length);
    int32_t write(int32_t byte);
    int32_t write(const char *b, int32_t length);
    int32_t writeValueAndFinal(int32_t i, UBool final);
    int32_t writeDelta(int32_t i);

    // Node classes for the compacting builder.
    // Each one supplies the byte-specific write() for a DictTrieBuilder node type.
    class BTFinalValueNode : public FinalValueNode {
    public:
        BTFinalValueNode(int32_t v) : FinalValueNode(v) {}
        virtual void write(DictTrieBuilder &builder);
    };

    class BTValueNode : public ValueNode {
    public:
        // The initial hash mixes in the hash code of the following node.
        BTValueNode(int32_t v, Node *nextNode)
                : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
        virtual UBool operator==(const Node &other) const;
        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
        virtual void write(DictTrieBuilder &builder);
    private:
        Node *next;
    };

    class BTLinearMatchNode : public LinearMatchNode {
    public:
        BTLinearMatchNode(const char *units, int32_t len, Node *nextNode);
        virtual UBool operator==(const Node &other) const;
        virtual void write(DictTrieBuilder &builder);
    private:
        const char *s;
    };

    class BTListBranchNode : public ListBranchNode {
    public:
        BTListBranchNode() : ListBranchNode() {}
        virtual void write(DictTrieBuilder &builder);
    };

    class BTSplitBranchNode : public SplitBranchNode {
    public:
        // The middle unit is handed to the base class as an unsigned byte value.
        BTSplitBranchNode(char middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
                : SplitBranchNode((uint8_t)middleUnit, lessThanNode, greaterOrEqualNode) {}
        virtual void write(DictTrieBuilder &builder);
    };

    class BTBranchHeadNode : public BranchHeadNode {
    public:
        BTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
        virtual void write(DictTrieBuilder &builder);
    };

    // Factory used by the base class so that final-value nodes get the byte-specific type.
    virtual Node *createFinalValueNode(int32_t value) const { return new BTFinalValueNode(value); }

    CharString strings;
    ByteTrieElement *elements;
    int32_t elementsCapacity;
    int32_t elementsLength;

    // Byte serialization of the trie.
    // The buffer is filled from its end: bytesLength counts bytes
    // backwards from the end of the buffer, not forwards from its start.
    char *bytes;
    int32_t bytesCapacity;
    int32_t bytesLength;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __BYTETRIEBUILDER_H__
|
167
icu4c/source/tools/toolutil/bytetrieiterator.cpp
Normal file
167
icu4c/source/tools/toolutil/bytetrieiterator.cpp
Normal file
|
@ -0,0 +1,167 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetrieiterator.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov03
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "bytetrie.h"
|
||||
#include "bytetrieiterator.h"
|
||||
#include "charstr.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Iterates from the root: the current and the initial position are both the
// start of the trie bytes, and no linear-match node is pending (-1).
ByteTrieIterator::ByteTrieIterator(const void *trieBytes, int32_t maxStringLength,
                                   UErrorCode &errorCode)
        : bytes_(static_cast<const uint8_t *>(trieBytes)),
          pos_(bytes_),
          initialPos_(bytes_),
          remainingMatchLength_(-1),
          initialRemainingMatchLength_(-1),
          maxLength_(maxStringLength),
          value_(0),
          stack_(errorCode) {
}
|
||||
|
||||
// Iterates from the current state of trie: its position and its pending
// linear-match length are copied, and also remembered for reset().
ByteTrieIterator::ByteTrieIterator(const ByteTrie &trie, int32_t maxStringLength,
                                   UErrorCode &errorCode)
        : bytes_(trie.bytes_),
          pos_(trie.pos_),
          initialPos_(trie.pos_),
          remainingMatchLength_(trie.remainingMatchLength_),
          initialRemainingMatchLength_(trie.remainingMatchLength_),
          maxLength_(maxStringLength),
          value_(0),
          stack_(errorCode) {
    if(remainingMatchLength_<0) {
        return;  // Not in the middle of a linear-match node.
    }
    // remainingMatchLength_ holds the actual remaining match length minus 1.
    int32_t pending=remainingMatchLength_+1;
    if(maxLength_>0 && pending>maxLength_) {
        // Cutting off here leaves remainingMatchLength_>=0, which next() uses as a signal.
        pending=maxLength_;
    }
    // Append the pending bytes of the linear-match node to str_.
    str_.append(reinterpret_cast<const char *>(pos_), pending, errorCode);
    pos_+=pending;
    remainingMatchLength_-=pending;
}
|
||||
|
||||
// Returns the iterator to the state it had right after construction.
ByteTrieIterator &ByteTrieIterator::reset() {
    // Number of pending linear-match bytes that belong to the initial string
    // (0 when the initial remaining match length is -1), capped at maxLength_.
    int32_t kept=initialRemainingMatchLength_+1;
    if(maxLength_>0 && kept>maxLength_) {
        kept=maxLength_;
    }
    stack_.setSize(0);
    str_.truncate(kept);
    pos_=initialPos_+kept;
    remainingMatchLength_=initialRemainingMatchLength_-kept;
    return *this;
}
|
||||
|
||||
// Advances to the next (byte sequence, value) pair.
// Returns FALSE if errorCode already indicates a failure or if nothing is left;
// otherwise sets sp_ and value_ and returns TRUE.
UBool
ByteTrieIterator::next(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    const uint8_t *p=pos_;
    if(p==NULL) {
        if(stack_.isEmpty()) {
            return FALSE;
        }
        // Backtrack: pop the saved state and continue with the next outbound
        // edge of that branch node.
        const int32_t top=stack_.size();
        const int32_t packed=stack_.elementAti(top-1);
        p=bytes_+stack_.elementAti(top-2);
        stack_.setSize(top-2);
        // Low 16 bits: string length before the node. Upper bits: remaining branch length.
        str_.truncate(packed&0xffff);
        const int32_t remainingEdges=(int32_t)((uint32_t)packed>>16);
        if(remainingEdges>1) {
            p=branchNext(p, remainingEdges, errorCode);
            if(p==NULL) {
                return TRUE;  // Reached a final value.
            }
        } else {
            // Last edge of the branch: just take its byte.
            str_.append((char)*p++, errorCode);
        }
    }
    if(remainingMatchLength_>=0) {
        // Only possible when iteration started inside a pending linear-match
        // node that has more than maxLength_ remaining bytes.
        return truncateAndStop();
    }
    for(;;) {
        int32_t node=*p++;
        if(node>=ByteTrie::kMinValueLead) {
            // Deliver the value for the byte sequence collected so far.
            const UBool isFinal=(UBool)(node&ByteTrie::kValueIsFinal);
            value_=ByteTrie::readValue(p, node>>1);
            if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
                // Nothing to continue with from here; the next call backtracks.
                pos_=NULL;
            } else {
                pos_=ByteTrie::skipValue(p, node);
            }
            sp_.set(str_.data(), str_.length());
            return TRUE;
        }
        if(maxLength_>0 && str_.length()==maxLength_) {
            return truncateAndStop();
        }
        if(node>=ByteTrie::kMinLinearMatch) {
            // Linear-match node: append its bytes to str_.
            const int32_t matchLength=node-ByteTrie::kMinLinearMatch+1;
            if(maxLength_>0 && str_.length()+matchLength>maxLength_) {
                // Take only as many bytes as still fit, then stop.
                str_.append(reinterpret_cast<const char *>(p),
                            maxLength_-str_.length(), errorCode);
                return truncateAndStop();
            }
            str_.append(reinterpret_cast<const char *>(p), matchLength, errorCode);
            p+=matchLength;
            continue;
        }
        // Branch node. A lead byte of 0 means that the length is in the next byte.
        if(node==0) {
            node=*p++;
        }
        p=branchNext(p, node+1, errorCode);
        if(p==NULL) {
            return TRUE;  // Reached a final value.
        }
    }
}
|
||||
|
||||
// Branch node, needs to take the first outbound edge and push state for the rest.
|
||||
const uint8_t *
|
||||
ByteTrieIterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
|
||||
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison byte
|
||||
// Push state for the greater-or-equal edge.
|
||||
stack_.addElement((int32_t)(ByteTrie::skipDelta(pos)-bytes_), errorCode);
|
||||
stack_.addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
|
||||
// Follow the less-than edge.
|
||||
length>>=1;
|
||||
pos=ByteTrie::jumpByDelta(pos);
|
||||
}
|
||||
// List of key-value pairs where values are either final values or jump deltas.
|
||||
// Read the first (key, value) pair.
|
||||
uint8_t trieByte=*pos++;
|
||||
int32_t node=*pos++;
|
||||
UBool isFinal=(UBool)(node&ByteTrie::kValueIsFinal);
|
||||
int32_t value=ByteTrie::readValue(pos, node>>1);
|
||||
pos=ByteTrie::skipValue(pos, node);
|
||||
stack_.addElement((int32_t)(pos-bytes_), errorCode);
|
||||
stack_.addElement(((length-1)<<16)|str_.length(), errorCode);
|
||||
str_.append((char)trieByte, errorCode);
|
||||
if(isFinal) {
|
||||
pos_=NULL;
|
||||
sp_.set(str_.data(), str_.length());
|
||||
value_=value;
|
||||
return NULL;
|
||||
} else {
|
||||
return pos+value;
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
126
icu4c/source/tools/toolutil/bytetrieiterator.h
Normal file
126
icu4c/source/tools/toolutil/bytetrieiterator.h
Normal file
|
@ -0,0 +1,126 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetrieiterator.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov03
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __BYTETRIEITERATOR_H__
|
||||
#define __BYTETRIEITERATOR_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: ByteTrie iterator for all of its (byte sequence, value) pairs.
|
||||
*/
|
||||
|
||||
// Needed if and when we change the .dat package index to a ByteTrie,
|
||||
// so that icupkg can work with an input package.
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "bytetrie.h"
|
||||
#include "charstr.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Iterator for all of the (byte sequence, value) pairs in a ByteTrie.
|
||||
*/
|
||||
class U_TOOLUTIL_API ByteTrieIterator : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Iterates from the root of a byte-serialized ByteTrie.
|
||||
* @param trieBytes The trie bytes.
|
||||
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
|
||||
* Otherwise, the iterator returns strings with this maximum length.
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
*/
|
||||
ByteTrieIterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Iterates from the current state of the specified ByteTrie.
|
||||
* @param trie The trie whose state will be copied for iteration.
|
||||
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
|
||||
* Otherwise, the iterator returns strings with this maximum length.
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
*/
|
||||
ByteTrieIterator(const ByteTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Resets this iterator to its initial state.
|
||||
*/
|
||||
ByteTrieIterator &reset();
|
||||
|
||||
/**
|
||||
* Finds the next (byte sequence, value) pair if there is one.
|
||||
*
|
||||
* If the byte sequence is truncated to the maximum length and does not
|
||||
* have a real value, then the value is set to -1.
|
||||
* In this case, this "not a real value" is indistinguishable from
|
||||
* a real value of -1.
|
||||
* @return TRUE if there is another element.
|
||||
*/
|
||||
UBool next(UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* @return TRUE if there are more elements.
|
||||
*/
|
||||
UBool hasNext() const { return pos_!=NULL || !stack_.isEmpty(); }
|
||||
|
||||
/**
|
||||
* @return the NUL-terminated byte sequence for the last successful next()
|
||||
*/
|
||||
const StringPiece &getString() const { return sp_; }
|
||||
/**
|
||||
* @return the value for the last successful next()
|
||||
*/
|
||||
int32_t getValue() const { return value_; }
|
||||
|
||||
private:
|
||||
UBool truncateAndStop() {
|
||||
pos_=NULL;
|
||||
value_=-1; // no real value for str
|
||||
sp_.set(str_.data(), str_.length());
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
|
||||
|
||||
const uint8_t *bytes_;
|
||||
const uint8_t *pos_;
|
||||
const uint8_t *initialPos_;
|
||||
int32_t remainingMatchLength_;
|
||||
int32_t initialRemainingMatchLength_;
|
||||
|
||||
CharString str_;
|
||||
StringPiece sp_;
|
||||
int32_t maxLength_;
|
||||
int32_t value_;
|
||||
|
||||
// The stack stores pairs of integers for backtracking to another
|
||||
// outbound edge of a branch node.
|
||||
// The first integer is an offset from ByteTrie.bytes.
|
||||
// The second integer has the str.length() from before the node in bits 15..0,
|
||||
// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
|
||||
// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
|
||||
// but the code looks more confusing that way.)
|
||||
UVector32 stack_;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __BYTETRIEITERATOR_H__
|
158
icu4c/source/tools/toolutil/denseranges.cpp
Normal file
158
icu4c/source/tools/toolutil/denseranges.cpp
Normal file
|
@ -0,0 +1,158 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: denseranges.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Helper code for finding a small number of dense ranges.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "denseranges.h"
|
||||
|
||||
// Definitions in the anonymous namespace are invisible outside this file.
|
||||
namespace {
|
||||
|
||||
/**
 * Collects the largest range gaps seen so far, at most 15 of them.
 * The gaps are kept ordered by size with the largest gap at index 0.
 */
class LargestGaps {
public:
    // max is the number of gaps to keep; values above the fixed capacity are clamped.
    LargestGaps(int32_t max) : maxLength(max), length(0) {
        if(maxLength>kCapacity) {
            maxLength=kCapacity;
        }
    }

    // Offers a gap. It is stored only if it ranks among the maxLength largest;
    // the smallest stored gap is dropped when the collection is already full.
    void add(int32_t gapStart, int64_t gapLength) {
        // Walk back past every stored gap that is strictly smaller than the new one.
        int32_t slot=length;
        for(; slot>0 && gapLength>gapLengths[slot-1]; --slot) {}
        if(slot>=maxLength) {
            return;  // Not one of the maxLength largest.
        }
        // Index of the last entry after the insertion: grow by one while there
        // is room, otherwise overwrite the current smallest.
        int32_t last;
        if(length<maxLength) {
            last=length;
            ++length;
        } else {
            last=maxLength-1;
        }
        // Shift the smaller gaps up by one to open the slot.
        for(int32_t k=last; k>slot; --k) {
            gapStarts[k]=gapStarts[k-1];
            gapLengths[k]=gapLengths[k-1];
        }
        gapStarts[slot]=gapStart;
        gapLengths[slot]=gapLength;
    }

    // Keeps only the newLength largest gaps; never grows the collection.
    void truncate(int32_t newLength) {
        if(length>newLength) {
            length=newLength;
        }
    }

    int32_t count() const { return length; }
    int32_t gapStart(int32_t i) const { return gapStarts[i]; }
    int64_t gapLength(int32_t i) const { return gapLengths[i]; }

    // Returns the index of the stored gap with the smallest start that is
    // greater than value, or -1 if there is none (including when empty).
    int32_t firstAfter(int32_t value) const {
        int32_t bestIndex=-1;
        int32_t bestStart=0;
        for(int32_t k=0; k<length; ++k) {
            const int32_t start=gapStarts[k];
            if(start<=value) {
                continue;
            }
            if(bestIndex<0 || start<bestStart) {
                bestStart=start;
                bestIndex=k;
            }
        }
        return bestIndex;
    }

private:
    static const int32_t kCapacity=15;

    int32_t maxLength;
    int32_t length;
    int32_t gapStarts[kCapacity];
    int64_t gapLengths[kCapacity];
};
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* Does it make sense to write 1..capacity ranges?
|
||||
* Returns 0 if not, otherwise the number of ranges.
|
||||
* @param values Sorted array of signed-integer values.
|
||||
* @param length Number of values.
|
||||
* @param density Minimum average range density, in 256th. (0x100=100%=perfectly dense.)
|
||||
* Should be 0x80..0x100, must be 1..0x100.
|
||||
* @param ranges Output ranges array.
|
||||
* @param capacity Maximum number of ranges.
|
||||
* @return Minimum number of ranges (at most capacity) that have the desired density,
|
||||
* or 0 if that density cannot be achieved.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uprv_makeDenseRanges(const int32_t values[], int32_t length,
|
||||
int32_t density,
|
||||
int32_t ranges[][2], int32_t capacity) {
|
||||
if(length<=2) {
|
||||
return 0;
|
||||
}
|
||||
int32_t minValue=values[0];
|
||||
int32_t maxValue=values[length-1]; // Assume minValue<=maxValue.
|
||||
// Use int64_t variables for intermediate-value precision and to avoid
|
||||
// signed-int32_t overflow of maxValue-minValue.
|
||||
int64_t maxLength=(int64_t)maxValue-(int64_t)minValue+1;
|
||||
if(length>=(density*maxLength)/0x100) {
|
||||
// Use one range.
|
||||
ranges[0][0]=minValue;
|
||||
ranges[0][1]=maxValue;
|
||||
return 1;
|
||||
}
|
||||
if(length<=4) {
|
||||
return 0;
|
||||
}
|
||||
// See if we can split [minValue, maxValue] into 2..capacity ranges,
|
||||
// divided by the 1..(capacity-1) largest gaps.
|
||||
LargestGaps gaps(capacity-1);
|
||||
int32_t i;
|
||||
int32_t expectedValue=minValue;
|
||||
for(i=1; i<length; ++i) {
|
||||
++expectedValue;
|
||||
int32_t actualValue=values[i];
|
||||
if(expectedValue!=actualValue) {
|
||||
gaps.add(expectedValue, (int64_t)actualValue-(int64_t)expectedValue);
|
||||
expectedValue=actualValue;
|
||||
}
|
||||
}
|
||||
// We know gaps.count()>=1 because we have fewer values (length) than
|
||||
// the length of the [minValue..maxValue] range (maxLength).
|
||||
// (Otherwise we would have returned with the one range above.)
|
||||
int32_t num;
|
||||
for(i=0, num=2;; ++i, ++num) {
|
||||
if(i>=gaps.count()) {
|
||||
// The values are too sparse for capacity or fewer ranges
|
||||
// of the requested density.
|
||||
return 0;
|
||||
}
|
||||
maxLength-=gaps.gapLength(i);
|
||||
if(length>num*2 && length>=(density*maxLength)/0x100) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Use the num ranges with the num-1 largest gaps.
|
||||
gaps.truncate(num-1);
|
||||
ranges[0][0]=minValue;
|
||||
for(i=0; i<=num-2; ++i) {
|
||||
int32_t gapIndex=gaps.firstAfter(minValue);
|
||||
int32_t gapStart=gaps.gapStart(gapIndex);
|
||||
ranges[i][1]=gapStart-1;
|
||||
ranges[i+1][0]=minValue=(int32_t)(gapStart+gaps.gapLength(gapIndex));
|
||||
}
|
||||
ranges[num-1][1]=maxValue;
|
||||
return num;
|
||||
}
|
39
icu4c/source/tools/toolutil/denseranges.h
Normal file
39
icu4c/source/tools/toolutil/denseranges.h
Normal file
|
@ -0,0 +1,39 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: denseranges.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010sep25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Helper code for finding a small number of dense ranges.
|
||||
*/
|
||||
|
||||
#ifndef __DENSERANGES_H__
|
||||
#define __DENSERANGES_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
/**
|
||||
* Does it make sense to write 1..capacity ranges?
|
||||
* Returns 0 if not, otherwise the number of ranges.
|
||||
* @param values Sorted array of signed-integer values.
|
||||
* @param length Number of values.
|
||||
* @param density Minimum average range density, in 256ths. (0x100=100%=perfectly dense.)
|
||||
* Should be 0x80..0x100, must be 1..0x100.
|
||||
* @param ranges Output ranges array.
|
||||
* @param capacity Maximum number of ranges.
|
||||
* @return Minimum number of ranges (at most capacity) that have the desired density,
|
||||
* or 0 if that density cannot be achieved.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uprv_makeDenseRanges(const int32_t values[], int32_t length,
|
||||
int32_t density,
|
||||
int32_t ranges[][2], int32_t capacity);
|
||||
|
||||
#endif // __DENSERANGES_H__
|
267
icu4c/source/tools/toolutil/dicttriebuilder.cpp
Normal file
267
icu4c/source/tools/toolutil/dicttriebuilder.cpp
Normal file
|
@ -0,0 +1,267 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: dicttriebuilder.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010dec24
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Base class for dictionary-trie builder classes.
|
||||
*/
|
||||
|
||||
#include <typeinfo> // for 'typeid' to work
|
||||
#include "unicode/utypes.h"
|
||||
#include "dicttriebuilder.h"
|
||||
#include "uassert.h"
|
||||
#include "uhash.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
// Hash callback for the node hashtable: forwards the key pointer to DictTrieBuilder::hashNode().
static int32_t U_CALLCONV
hashDictTrieNode(const UHashTok key) {
    const void *node=key.pointer;
    return U_NAMESPACE_QUALIFIER DictTrieBuilder::hashNode(node);
}
|
||||
|
||||
// Key comparison callback for the node hashtable: forwards both key pointers
// to DictTrieBuilder::equalNodes().
static UBool U_CALLCONV
equalDictTrieNodes(const UHashTok key1, const UHashTok key2) {
    const void *left=key1.pointer;
    const void *right=key2.pointer;
    return U_NAMESPACE_QUALIFIER DictTrieBuilder::equalNodes(left, right);
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// The node hashtable does not exist until createCompactBuilder() is called.
DictTrieBuilder::DictTrieBuilder()
        : nodes(NULL) {
}
|
||||
|
||||
// Closes the node hashtable via deleteCompactBuilder().
DictTrieBuilder::~DictTrieBuilder() {
    deleteCompactBuilder();
}
|
||||
|
||||
void
|
||||
DictTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
nodes=uhash_openSize(hashDictTrieNode, equalDictTrieNodes, NULL,
|
||||
sizeGuess, &errorCode);
|
||||
if(U_SUCCESS(errorCode) && nodes==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
uhash_setKeyDeleter(nodes, uhash_deleteUObject);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
DictTrieBuilder::deleteCompactBuilder() {
|
||||
uhash_close(nodes);
|
||||
nodes=NULL;
|
||||
}
|
||||
|
||||
// Returns the registered node that is equivalent to newNode.
// If an equivalent node is already in the table, newNode is deleted and the
// existing node is returned; otherwise newNode is added and returned.
// On failure newNode is deleted (if not NULL) and NULL is returned.
// A NULL newNode is reported as U_MEMORY_ALLOCATION_ERROR.
DictTrieBuilder::Node *
DictTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        delete newNode;
        return NULL;
    }
    if(newNode==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    const UHashElement *existing=uhash_find(nodes, newNode);
    if(existing!=NULL) {
        delete newNode;
        return (Node *)existing->key.pointer;
    }
    // If uhash_puti() returns a non-zero value from an equivalent, previously
    // registered node, then uhash_find() failed to find that and we will leak newNode.
#if !U_RELEASE
    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
    uhash_puti(nodes, newNode, 1, &errorCode);
    U_ASSERT(oldValue==0);
    if(U_SUCCESS(errorCode)) {
        return newNode;
    }
    // NOTE(review): createCompactBuilder() installs a key deleter on this table.
    // If uhash_puti() already deletes the key it was given when it fails,
    // this delete is a double free -- confirm against the uhash implementation.
    delete newNode;
    return NULL;
}
|
||||
|
||||
// Returns the registered final-value node for value, creating and
// registering one via createFinalValueNode() if there is none yet.
// Returns NULL on failure.
DictTrieBuilder::Node *
DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    // Look up with a temporary stack object so that no heap node is created
    // when an equivalent one is already registered.
    FinalValueNode lookupKey(value);
    const UHashElement *existing=uhash_find(nodes, &lookupKey);
    if(existing!=NULL) {
        return (Node *)existing->key.pointer;
    }
    Node *newNode=createFinalValueNode(value);
    if(newNode==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    // If uhash_puti() returns a non-zero value from an equivalent, previously
    // registered node, then uhash_find() failed to find that and we will leak newNode.
#if !U_RELEASE
    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
    uhash_puti(nodes, newNode, 1, &errorCode);
    U_ASSERT(oldValue==0);
    if(U_SUCCESS(errorCode)) {
        return newNode;
    }
    // NOTE(review): createCompactBuilder() installs a key deleter on this table.
    // If uhash_puti() already deletes the key it was given when it fails,
    // this delete is a double free -- confirm against the uhash implementation.
    delete newNode;
    return NULL;
}
|
||||
|
||||
// Returns the hash code of the Node that node points to.
// NOTE(review): the declared return type is UBool while the hashtable callback
// hashDictTrieNode() returns int32_t. If UBool is narrower than the hash code,
// the hash is truncated here -- confirm, and if so change the return type
// together with the declaration in the class.
UBool DictTrieBuilder::hashNode(const void *node) {
    const Node *n=(const Node *)node;
    return n->hashCode();
}
|
||||
|
||||
// Compares the two Node objects that left and right point to, using the
// virtual operator== of the left one.
UBool DictTrieBuilder::equalNodes(const void *left, const void *right) {
    const Node &lhs=*(const Node *)left;
    const Node &rhs=*(const Node *)right;
    return lhs==rhs;
}
|
||||
|
||||
// Base comparison: the same object, or the same dynamic type with the same hash.
// Subclasses compare their own fields on top of this.
UBool DictTrieBuilder::Node::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    return typeid(*this)==typeid(other) && hash==other.hash;
}
|
||||
|
||||
// Records edgeNumber in offset unless offset is already non-zero.
// Always returns edgeNumber unchanged.
int32_t DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;  // Already marked.
    }
    offset=edgeNumber;
    return edgeNumber;
}
|
||||
|
||||
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder::Node)
|
||||
|
||||
// Two final-value nodes are equal if their hash codes and values match.
UBool DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    // Node::operator==(other) is deliberately not used here:
    // registerFinalValue() compares a stack-allocated FinalValueNode
    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
    // with the specific builder's subclass of FinalValueNode. The base class
    // comparison checks the typeid's and would never treat those two as equal.
    // This workaround assumes that the subclass does not add fields that need to be compared.
    if(hash!=other.hashCode()) {
        return FALSE;
    }
    const FinalValueNode *that=dynamic_cast<const FinalValueNode *>(&other);
    if(that==NULL) {
        return FALSE;  // Not a final-value node at all.
    }
    return value==that->value;
}
|
||||
|
||||
// Equal if the base comparison succeeds and both nodes agree on whether they
// have a value and, if they do, on the value itself.
UBool DictTrieBuilder::ValueNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!Node::operator==(other)) {
        return FALSE;
    }
    // The base comparison has checked that the dynamic types are the same.
    const ValueNode &that=(const ValueNode &)other;
    if(hasValue!=that.hasValue) {
        return FALSE;
    }
    // The value field only matters when a value is present.
    return !hasValue || value==that.value;
}
|
||||
|
||||
// Equal if the ValueNode comparison succeeds and both nodes have the same
// length and the same next-node pointer.
UBool DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!ValueNode::operator==(other)) {
        return FALSE;
    }
    // The base comparison has checked that the dynamic types are the same.
    const LinearMatchNode &that=(const LinearMatchNode &)other;
    if(length!=that.length) {
        return FALSE;
    }
    return next==that.next;
}
|
||||
|
||||
// Marks the next node first and then takes over its resulting edge number.
// Does nothing if this node is already marked (offset is non-zero).
int32_t DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    edgeNumber=next->markRightEdgesFirst(edgeNumber);
    offset=edgeNumber;
    return edgeNumber;
}
|
||||
|
||||
// Equal if the base comparison succeeds, both lists have the same number of
// entries, and every entry has the same unit, value and target node pointer.
UBool DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!Node::operator==(other)) {
        return FALSE;
    }
    // The base comparison has checked that the dynamic types are the same.
    const ListBranchNode &o=(const ListBranchNode &)other;
    // The base comparison only checks type and hash. Without this length check,
    // two lists with colliding hashes would compare equal whenever the first
    // `length` entries of the other list happen to match, and the result
    // would depend on which of the two nodes is on the left.
    if(length!=o.length) {
        return FALSE;
    }
    for(int32_t i=0; i<length; ++i) {
        if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
            return FALSE;
        }
    }
    return TRUE;
}
|
||||
|
||||
// Marks the target nodes of the list entries from the last entry to the first.
// The last entry is marked with the incoming edge number; each earlier one
// with the running edge number minus one.
// Does nothing if this node is already marked (offset is non-zero).
int32_t DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    firstEdgeNumber=edgeNumber;
    UBool rightmost=TRUE;
    int32_t index=length;
    do {
        --index;
        Node *target=equal[index];
        if(target!=NULL) {
            // For all but the rightmost edge, decrement the edge number.
            edgeNumber=target->markRightEdgesFirst(rightmost ? edgeNumber : edgeNumber-1);
        }
        rightmost=FALSE;
    } while(index>0);
    offset=edgeNumber;
    return edgeNumber;
}
|
||||
|
||||
// Equal if the base comparison succeeds and the split unit and both
// child node pointers are the same.
UBool DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!Node::operator==(other)) {
        return FALSE;
    }
    // The base comparison has checked that the dynamic types are the same.
    const SplitBranchNode &that=(const SplitBranchNode &)other;
    if(unit!=that.unit) {
        return FALSE;
    }
    return lessThan==that.lessThan && greaterOrEqual==that.greaterOrEqual;
}
|
||||
|
||||
// Marks the greater-or-equal child first, then the less-than child with the
// resulting edge number minus one, and takes over the final edge number.
// Does nothing if this node is already marked (offset is non-zero).
int32_t DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    firstEdgeNumber=edgeNumber;
    const int32_t afterRight=greaterOrEqual->markRightEdgesFirst(edgeNumber);
    edgeNumber=lessThan->markRightEdgesFirst(afterRight-1);
    offset=edgeNumber;
    return edgeNumber;
}
|
||||
|
||||
// Equal if the ValueNode comparison succeeds and both nodes have the same
// length and the same next-node pointer.
UBool DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!ValueNode::operator==(other)) {
        return FALSE;
    }
    // The base comparison has checked that the dynamic types are the same.
    const BranchHeadNode &that=(const BranchHeadNode &)other;
    if(length!=that.length) {
        return FALSE;
    }
    return next==that.next;
}
|
||||
|
||||
// Forwards the edge numbering to the branch sub-node and remembers the result
// in offset. Returns edgeNumber unchanged if offset is already non-zero.
int32_t DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    const int32_t marked=next->markRightEdgesFirst(edgeNumber);
    offset=marked;
    return marked;
}
|
||||
|
||||
U_NAMESPACE_END
|
251
icu4c/source/tools/toolutil/dicttriebuilder.h
Normal file
251
icu4c/source/tools/toolutil/dicttriebuilder.h
Normal file
|
@ -0,0 +1,251 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: dicttriebuilder.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010dec24
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Base class for dictionary-trie builder classes.
|
||||
*/
|
||||
|
||||
#ifndef __DICTTRIEBUILDER_H__
|
||||
#define __DICTTRIEBUILDER_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uhash.h"
|
||||
|
||||
// Build options for dictionary tries.
// NOTE(review): the names suggest a trade-off between build speed (FAST) and
// size of the resulting trie (SMALL) -- confirm against the builders that
// consume this option; this header does not show how it is used.
enum UDictTrieBuildOption {
|
||||
UDICTTRIE_BUILD_FAST,
|
||||
UDICTTRIE_BUILD_SMALL
|
||||
};
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Base class for dictionary-trie builder classes.
 * Declares the Node class hierarchy from which a trie is assembled, and the
 * functions that register nodes so that equivalent nodes exist only once.
 */
class U_TOOLUTIL_API DictTrieBuilder : public UMemory {
public:
    /**
     * NOTE(review): declared to return UBool although Node::hashCode() returns
     * int32_t; if this is meant to return a hash code, the narrow return type
     * truncates it -- confirm against the definition before changing.
     * @internal
     */
    static UBool hashNode(const void *node);
    /** @internal */
    static UBool equalNodes(const void *left, const void *right);

protected:
    DictTrieBuilder();
    ~DictTrieBuilder();

    class Node;

    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
    void deleteCompactBuilder();

    /**
     * Makes sure that there is only one unique node registered that is
     * equivalent to newNode.
     * @param newNode Input node. The builder takes ownership.
     * @param errorCode ICU in/out UErrorCode.
     *                  Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
     * @return newNode if it is the first of its kind, or
     *         an equivalent node if newNode is a duplicate.
     */
    Node *registerNode(Node *newNode, UErrorCode &errorCode);
    /**
     * Makes sure that there is only one unique FinalValueNode registered
     * with this value.
     * Avoids creating a node if the value is a duplicate.
     * @param value A final value.
     * @param errorCode ICU in/out UErrorCode.
     *                  Set to U_MEMORY_ALLOCATION_ERROR if it was success but
     *                  the node for the value is NULL.
     * @return A FinalValueNode with the given value.
     */
    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);

    /*
     * C++ note:
     * registerNode() and registerFinalValue() take ownership of their input nodes,
     * and only return owned nodes.
     * If they see a failure UErrorCode, they will delete the input node.
     * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
     * If there is a failure, they return NULL.
     *
     * NULL Node pointers can be safely passed into other Nodes because
     * they call the static Node::hashCode() which checks for a NULL pointer first.
     *
     * Therefore, as long as builder functions register a new node,
     * they need to check for failures only before explicitly dereferencing
     * a Node pointer, or before setting a new UErrorCode.
     */

    /** Creates a builder-specific FinalValueNode; implemented by subclasses. */
    virtual Node *createFinalValueNode(int32_t value) const = 0;

    // Hash set of nodes, maps from nodes to integer 1.
    UHashtable *nodes;

    /**
     * Base class of all trie nodes.
     * Carries a hash code that is fixed at construction (subclasses may fold
     * more data into it) and an offset that starts at 0.
     */
    class Node : public UObject {
    public:
        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
        inline int32_t hashCode() const { return hash; }
        // Handles node==NULL.
        static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
        // Base class operator==() compares the actual class types.
        virtual UBool operator==(const Node &other) const;
        inline UBool operator!=(const Node &other) const { return !operator==(other); }
        /**
         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
         * This is to avoid writing a duplicate node twice.
         *
         * Branch nodes in this trie data structure are not symmetric.
         * Most branch edges "jump" to other nodes but the rightmost branch edges
         * just continue without a jump.
         * Therefore, write() must write the rightmost branch edge last
         * (trie units are written backwards), and must write it at that point even if
         * it is a duplicate of a node previously written elsewhere.
         *
         * This function visits and marks right branch edges first.
         * Edges are numbered with increasingly negative values because we share the
         * offset field which gets positive values when nodes are written.
         * A branch edge also remembers the first number for any of its edges.
         *
         * When a further-left branch edge has a number in the range of the rightmost
         * edge's numbers, then it will be written as part of the required right edge
         * and we can avoid writing it first.
         *
         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
         * edge numbers.
         *
         * @param edgeNumber The first edge number for this node and its sub-nodes.
         * @return An edge number that is at least the maximum-negative
         *         of the input edge number and the numbers of this node and all of its sub-nodes.
         */
        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
        // write() must set the offset to a positive value.
        virtual void write(DictTrieBuilder &builder) = 0;
        // See markRightEdgesFirst.
        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
                                               DictTrieBuilder &builder) {
            // Note: Edge numbers are negative, lastRight<=firstRight.
            // If offset>0 then this node and its sub-nodes have been written already
            // and we need not write them again.
            // If this node is part of the unwritten right branch edge,
            // then we wait until that is written.
            if(offset<0 && (offset<lastRight || firstRight<offset)) {
                write(builder);
            }
        }
        inline int32_t getOffset() const { return offset; }
    protected:
        int32_t hash;
        int32_t offset;
    private:
        // No ICU "poor man's RTTI" for this class nor its subclasses.
        virtual UClassID getDynamicClassID() const;
    };

    // Node that holds only a final value.
    class FinalValueNode : public Node {
    public:
        FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
        virtual UBool operator==(const Node &other) const;
        // Dummy default implementation, must be overridden for real writing.
        virtual void write(DictTrieBuilder & /*builder*/) {}
    protected:
        int32_t value;
    };

    // Base class for nodes that may carry a value; initially without one.
    class ValueNode : public Node {
    public:
        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
        virtual UBool operator==(const Node &other) const;
        // Stores the value and folds it into the hash code.
        void setValue(int32_t v) {
            hasValue=TRUE;
            value=v;
            hash=hash*37+v;
        }
    protected:
        UBool hasValue;
        int32_t value;
    };

    // Node with a length and a pointer to the node that follows it.
    class LinearMatchNode : public ValueNode {
    public:
        LinearMatchNode(int32_t len, Node *nextNode)
                : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
                  length(len), next(nextNode) {}
        virtual UBool operator==(const Node &other) const;
        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    protected:
        int32_t length;
        Node *next;
    };

    // Common base class of the branch nodes; remembers the first edge number
    // that markRightEdgesFirst() handed to the branch.
    class BranchNode : public Node {
    public:
        // firstEdgeNumber starts at 0 so that it never holds an indeterminate
        // value before markRightEdgesFirst() assigns it.
        BranchNode(int32_t initialHash) : Node(initialHash), firstEdgeNumber(0) {}
    protected:
        int32_t firstEdgeNumber;
    };

    // Branch node with a short list of units, each with either a final value
    // or a sub-node.
    // The add() functions do not check the capacity of the fixed-size arrays;
    // callers must not add more entries than the arrays can hold.
    class ListBranchNode : public BranchNode {
    public:
        ListBranchNode() : BranchNode(0x444444), length(0) {}
        virtual UBool operator==(const Node &other) const;
        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
        // Adds a unit with a final value.
        void add(int32_t c, int32_t value) {
            units[length]=(UChar)c;
            equal[length]=NULL;
            values[length]=value;
            ++length;
            hash=(hash*37+c)*37+value;
        }
        // Adds a unit which leads to another match node.
        void add(int32_t c, Node *node) {
            units[length]=(UChar)c;
            equal[length]=node;
            values[length]=0;
            ++length;
            hash=(hash*37+c)*37+hashCode(node);
        }
    protected:
        // TODO: 10 -> max(BT/UCT max list lengths)
        Node *equal[10];  // NULL means "has final value".
        int32_t length;
        int32_t values[10];
        UChar units[10];
    };

    // Branch node that splits on one unit into a less-than sub-node and a
    // greater-or-equal sub-node.
    class SplitBranchNode : public BranchNode {
    public:
        SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
                : BranchNode(((0x555555*37+middleUnit)*37+
                              hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
        virtual UBool operator==(const Node &other) const;
        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    protected:
        UChar unit;
        Node *lessThan;
        Node *greaterOrEqual;
    };

    // Branch head node, for writing the actual node lead unit.
    class BranchHeadNode : public ValueNode {
    public:
        BranchHeadNode(int32_t len, Node *subNode)
                : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
                  length(len), next(subNode) {}
        virtual UBool operator==(const Node &other) const;
        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
    protected:
        int32_t length;
        Node *next;  // A branch sub-node.
    };
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __DICTTRIEBUILDER_H__
|
|
@ -68,9 +68,94 @@
|
|||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/* Unicode property (value) aliases data swapping --------------------------- */
|
||||
|
||||
/**
 * Swaps (or, when length<0, only measures) the data of a pnames.icu file with
 * formatVersion 2.
 *
 * @param ds         The swapper, which provides the integer readers and array swappers.
 * @param inData     Input data, starting with the ICU data header.
 * @param length     Number of input bytes, or a negative value for preflighting:
 *                   then nothing is written and only the size is returned.
 * @param outData    Output buffer; may be the same as inData.
 * @param pErrorCode ICU in/out error code.
 * @return The header size plus the total size from the data's indexes,
 *         or 0 if there was an error.
 */
static int32_t
upname_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
    /* udata_swapDataHeader checks the arguments */
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    const UDataInfo *pInfo=
        reinterpret_cast<const UDataInfo *>(
            reinterpret_cast<const char *>(inData)+4);
    if(!(
        pInfo->dataFormat[0]==0x70 &&   /* dataFormat="pnam" */
        pInfo->dataFormat[1]==0x6e &&
        pInfo->dataFormat[2]==0x61 &&
        pInfo->dataFormat[3]==0x6d &&
        pInfo->formatVersion[0]==2
    )) {
        udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    const uint8_t *inBytes=reinterpret_cast<const uint8_t *>(inData)+headerSize;

    if(length>=0) {
        length-=headerSize;
        // formatVersion 2 initially has indexes[8], 32 bytes.
        if(length<32) {
            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n",
                             (int)length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    const int32_t *inIndexes=reinterpret_cast<const int32_t *>(inBytes);
    int32_t totalSize=udata_readInt32(ds, inIndexes[PropNameData::IX_TOTAL_SIZE]);
    if(length>=0) {
        if(length<totalSize) {
            udata_printError(ds, "upname_swap(): too few bytes (%d after header, should be %d) "
                             "for pnames.icu\n",
                             (int)length, (int)totalSize);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        int32_t numBytesIndexesAndValueMaps=
            udata_readInt32(ds, inIndexes[PropNameData::IX_BYTE_TRIES_OFFSET]);

        // The offset comes from the data file and is used below as a swap length
        // and in a size subtraction. Reject values that do not cover the 32-byte
        // indexes or that lie beyond the total size, so that neither the swap
        // nor the copy can run outside the validated range.
        if(numBytesIndexesAndValueMaps<32 || totalSize<numBytesIndexesAndValueMaps) {
            udata_printError(ds, "upname_swap(): inconsistent indexes (byte tries offset %d, "
                             "total size %d) for pnames.icu\n",
                             (int)numBytesIndexesAndValueMaps, (int)totalSize);
            *pErrorCode=U_INVALID_FORMAT_ERROR;
            return 0;
        }

        // The output pointer is only needed when actually swapping.
        uint8_t *outBytes=reinterpret_cast<uint8_t *>(outData)+headerSize;

        // Swap the indexes[] and the valueMaps[].
        ds->swapArray32(ds, inBytes, numBytesIndexesAndValueMaps, outBytes, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            return 0;
        }

        // Copy the rest of the data.
        if(inBytes!=outBytes) {
            uprv_memcpy(outBytes+numBytesIndexesAndValueMaps,
                        inBytes+numBytesIndexesAndValueMaps,
                        totalSize-numBytesIndexesAndValueMaps);
        }

        // We need not swap anything else:
        //
        // The ByteTries are already byte-serialized, and are fixed on ASCII.
        // (On an EBCDIC machine, the input string is converted to lowercase ASCII
        // while matching.)
        //
        // The name groups are mostly invariant characters, but since we only
        // generate, and keep in subversion, ASCII versions of pnames.icu,
        // and since only ICU4J uses the pnames.icu data file
        // (the data is hardcoded in ICU4C) and ICU4J uses ASCII data files,
        // we just copy those bytes too.
    }

    return headerSize+totalSize;
}
|
||||
|
||||
/* Unicode properties data swapping ----------------------------------------- */
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
static int32_t
|
||||
uprops_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
@ -218,7 +303,7 @@ uprops_swap(const UDataSwapper *ds,
|
|||
|
||||
/* Unicode case mapping data swapping --------------------------------------- */
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
static int32_t
|
||||
ucase_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
@ -320,7 +405,7 @@ ucase_swap(const UDataSwapper *ds,
|
|||
|
||||
/* Unicode bidi/shaping data swapping --------------------------------------- */
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
static int32_t
|
||||
ubidi_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
@ -428,7 +513,7 @@ ubidi_swap(const UDataSwapper *ds,
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
static int32_t
|
||||
unorm_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
@ -552,7 +637,7 @@ unorm_swap(const UDataSwapper *ds,
|
|||
#endif
|
||||
|
||||
/* Swap 'Test' data from gentest */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
static int32_t
|
||||
test_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode) {
|
||||
|
|
|
@ -246,6 +246,10 @@
|
|||
</Link>
|
||||
</ItemDefinitionGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="bytetriebuilder.cpp" />
|
||||
<ClCompile Include="bytetrieiterator.cpp" />
|
||||
<ClCompile Include="denseranges.cpp" />
|
||||
<ClCompile Include="dicttriebuilder.cpp" />
|
||||
<ClCompile Include="filestrm.c" />
|
||||
<ClCompile Include="filetools.cpp" />
|
||||
<ClCompile Include="flagparser.c" />
|
||||
|
@ -272,6 +276,9 @@
|
|||
<DisableLanguageExtensions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</DisableLanguageExtensions>
|
||||
</ClCompile>
|
||||
<ClCompile Include="ucbuf.c" />
|
||||
<ClCompile Include="uchartrie.cpp" />
|
||||
<ClCompile Include="uchartriebuilder.cpp" />
|
||||
<ClCompile Include="uchartrieiterator.cpp" />
|
||||
<ClCompile Include="ucm.c" />
|
||||
<ClCompile Include="ucmstate.c" />
|
||||
<ClCompile Include="unewdata.c" />
|
||||
|
@ -289,6 +296,10 @@
|
|||
</ClCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClInclude Include="bytetriebuilder.h" />
|
||||
<ClInclude Include="bytetrieiterator.h" />
|
||||
<ClInclude Include="denseranges.h" />
|
||||
<ClInclude Include="dicttriebuilder.h" />
|
||||
<ClInclude Include="filestrm.h" />
|
||||
<ClInclude Include="filetools.h" />
|
||||
<ClInclude Include="flagparser.h" />
|
||||
|
@ -301,6 +312,9 @@
|
|||
<ClInclude Include="swapimpl.h" />
|
||||
<ClInclude Include="toolutil.h" />
|
||||
<ClInclude Include="ucbuf.h" />
|
||||
<ClInclude Include="uchartrie.h" />
|
||||
<ClInclude Include="uchartriebuilder.h" />
|
||||
<ClInclude Include="uchartrieiterator.h" />
|
||||
<ClInclude Include="ucm.h" />
|
||||
<ClInclude Include="unewdata.h" />
|
||||
<ClInclude Include="uoptions.h" />
|
||||
|
@ -323,4 +337,4 @@
|
|||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||
<ImportGroup Label="ExtensionTargets">
|
||||
</ImportGroup>
|
||||
</Project>
|
||||
</Project>
|
||||
|
|
414
icu4c/source/tools/toolutil/uchartrie.cpp
Normal file
414
icu4c/source/tools/toolutil/uchartrie.cpp
Normal file
|
@ -0,0 +1,414 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartrie.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov14
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uassert.h"
|
||||
#include "uchartrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Default implementation: a code point above U+FFFF is appended as its lead
// surrogate followed by its trail surrogate; anything else is appended as a
// single code unit.
Appendable &
Appendable::appendCodePoint(UChar32 c) {
    if(c>0xffff) {
        Appendable &afterLead=append(U16_LEAD(c));
        return afterLead.append(U16_TRAIL(c));
    }
    return append((UChar)c);
}
|
||||
|
||||
// Default implementation: appends the string one code unit at a time.
// A NULL pointer or a zero length appends nothing.
// A negative length means that s is NUL-terminated.
Appendable &
Appendable::append(const UChar *s, int32_t length) {
    if(s==NULL || length==0) {
        return *this;
    }
    if(length>0) {
        // Explicit length.
        for(const UChar *end=s+length; s<end; ++s) {
            append(*s);
        }
    } else {
        // NUL-terminated: stop at, and do not append, the terminator.
        for(; *s!=0; ++s) {
            append(*s);
        }
    }
    return *this;
}
|
||||
|
||||
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Appendable)
|
||||
|
||||
// Reports the result for the current position without consuming input:
// no match if the trie is stopped, a value result if no linear match is
// pending and the current unit is a value lead, otherwise "no value".
UDictTrieResult
UCharTrie::current() const {
    const UChar *here=pos_;
    if(here==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    if(remainingMatchLength_<0) {
        int32_t lead=*here;
        if(lead>=kMinValueLead) {
            return valueResult(lead);
        }
    }
    return UDICTTRIE_NO_VALUE;
}
|
||||
|
||||
// Selects the branch edge for uchar.
// pos points just behind the branch lead unit; length is the value from that
// lead unit. Updates pos_ (or stops the trie) and returns the match result.
UDictTrieResult
UCharTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
    // A zero length from the lead unit means that the length is in the next unit.
    if(length==0) {
        length=*pos++;
    }
    // From here on, length is the number of units to select from.
    length+=1;
    // The data structure encodes a binary search:
    // narrow down while there are too many units for the linear part.
    while(length>kMaxBranchLinearSubNodeLength) {
        const int32_t lowerCount=length>>1;
        const int32_t pivot=*pos++;
        if(uchar<pivot) {
            length=lowerCount;
            pos=jumpByDelta(pos);
        } else {
            length-=lowerCount;
            pos=skipDelta(pos);
        }
    }
    // Linear search over all but the last of the remaining units.
    // length>=2 here because the loop above only halves lengths that are
    // greater than kMaxBranchLinearSubNodeLength>=3.
    do {
        const int32_t candidate=*pos++;
        if(uchar==candidate) {
            UDictTrieResult outcome;
            int32_t lead=*pos;
            if(lead&kValueIsFinal) {
                // Leave the final value for getValue() to read.
                outcome=UDICTTRIE_HAS_FINAL_VALUE;
            } else {
                // The non-final value is the jump delta; decode it inline.
                ++pos;
                int32_t jump;
                if(lead<kMinTwoUnitValueLead) {
                    jump=lead;
                } else if(lead<kThreeUnitValueLead) {
                    jump=((lead-kMinTwoUnitValueLead)<<16)|*pos++;
                } else {
                    jump=(pos[0]<<16)|pos[1];
                    pos+=2;
                }
                pos+=jump;
                lead=*pos;
                if(lead>=kMinValueLead) {
                    outcome=valueResult(lead);
                } else {
                    outcome=UDICTTRIE_NO_VALUE;
                }
            }
            pos_=pos;
            return outcome;
        }
        pos=skipValue(pos);
    } while(--length>1);
    // The last unit has no value or delta: matching continues right behind it.
    const int32_t lastUnit=*pos++;
    if(uchar!=lastUnit) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    pos_=pos;
    int32_t lead=*pos;
    if(lead>=kMinValueLead) {
        return valueResult(lead);
    }
    return UDICTTRIE_NO_VALUE;
}
|
||||
|
||||
// Matches uchar against the node at pos, skipping any intermediate values.
// Stops the trie and returns UDICTTRIE_NO_MATCH if the unit does not continue
// a string.
UDictTrieResult
UCharTrie::nextImpl(const UChar *pos, int32_t uchar) {
    int32_t lead=*pos++;
    for(;;) {
        if(lead<kMinLinearMatch) {
            return branchNext(pos, lead, uchar);
        }
        if(lead<kMinValueLead) {
            // Linear-match node: compare with the first of its units.
            int32_t remaining=lead-kMinLinearMatch;  // Actual match length minus 1.
            const int32_t expected=*pos++;
            if(uchar!=expected) {
                break;  // No match.
            }
            --remaining;
            remainingMatchLength_=remaining;
            pos_=pos;
            if(remaining<0) {
                lead=*pos;
                if(lead>=kMinValueLead) {
                    return valueResult(lead);
                }
            }
            return UDICTTRIE_NO_VALUE;
        }
        if(lead&kValueIsFinal) {
            break;  // No further matching units.
        }
        // Skip the intermediate value and look at the node type behind it.
        pos=skipNodeValue(pos, lead);
        lead&=kNodeTypeMask;
    }
    stop();
    return UDICTTRIE_NO_MATCH;
}
|
||||
|
||||
// Continues matching from the current state with one more unit.
UDictTrieResult
UCharTrie::next(int32_t uchar) {
    const UChar *here=pos_;
    if(here==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    int32_t remaining=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(remaining<0) {
        // Not inside a linear-match node: interpret the node at the current position.
        return nextImpl(here, uchar);
    }
    // Remaining part of a linear-match node.
    const int32_t expected=*here++;
    if(uchar!=expected) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    --remaining;
    remainingMatchLength_=remaining;
    pos_=here;
    if(remaining<0) {
        int32_t lead=*here;
        if(lead>=kMinValueLead) {
            return valueResult(lead);
        }
    }
    return UDICTTRIE_NO_VALUE;
}
|
||||
|
||||
// Continues matching from the current state with a whole string.
// sLength<0 means that s is NUL-terminated.
// Empty input returns current().
UDictTrieResult
UCharTrie::next(const UChar *s, int32_t sLength) {
    const bool emptyInput= sLength<0 ? *s==0 : sLength==0;
    if(emptyInput) {
        return current();
    }
    const UChar *cursor=pos_;
    if(cursor==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    int32_t pending=remainingMatchLength_;  // Actual remaining match length minus 1.
    for(;;) {
        // Fetch the next input unit, if there is one.
        // While inside a linear-match node, keep comparing units in a tight
        // loop that does not need to recheck which kind of length we have.
        int32_t unit;
        if(sLength<0) {
            // NUL-terminated input.
            for(;;) {
                unit=*s++;
                if(unit==0) {
                    // End of input: save the state and report what is here.
                    remainingMatchLength_=pending;
                    pos_=cursor;
                    if(pending<0) {
                        int32_t lead=*cursor;
                        if(lead>=kMinValueLead) {
                            return valueResult(lead);
                        }
                    }
                    return UDICTTRIE_NO_VALUE;
                }
                if(pending<0) {
                    // Linear match finished; handle the unit with the node loop below.
                    remainingMatchLength_=pending;
                    break;
                }
                if(unit!=*cursor) {
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                ++cursor;
                --pending;
            }
        } else {
            // Input with explicit length.
            for(;;) {
                if(sLength==0) {
                    // End of input: save the state and report what is here.
                    remainingMatchLength_=pending;
                    pos_=cursor;
                    if(pending<0) {
                        int32_t lead=*cursor;
                        if(lead>=kMinValueLead) {
                            return valueResult(lead);
                        }
                    }
                    return UDICTTRIE_NO_VALUE;
                }
                unit=*s++;
                --sLength;
                if(pending<0) {
                    // Linear match finished; handle the unit with the node loop below.
                    remainingMatchLength_=pending;
                    break;
                }
                if(unit!=*cursor) {
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                ++cursor;
                --pending;
            }
        }
        // Interpret nodes until the unit has been consumed by a linear-match
        // node (then go back to the fetch loops) or matching ends.
        int32_t lead=*cursor++;
        for(;;) {
            if(lead<kMinLinearMatch) {
                UDictTrieResult branchResult=branchNext(cursor, lead, unit);
                if(branchResult==UDICTTRIE_NO_MATCH) {
                    return UDICTTRIE_NO_MATCH;
                }
                // Fetch the next input unit, if there is one.
                if(sLength<0) {
                    unit=*s++;
                    if(unit==0) {
                        return branchResult;
                    }
                } else {
                    if(sLength==0) {
                        return branchResult;
                    }
                    unit=*s++;
                    --sLength;
                }
                if(branchResult==UDICTTRIE_HAS_FINAL_VALUE) {
                    // More input, but no further matching units.
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                cursor=pos_;  // branchNext() advanced pos and wrote it to pos_ .
                lead=*cursor++;
            } else if(lead<kMinValueLead) {
                // Linear-match node: match the first of length+1 units here,
                // the rest in the fetch loops above.
                pending=lead-kMinLinearMatch;  // Actual match length minus 1.
                if(unit!=*cursor) {
                    stop();
                    return UDICTTRIE_NO_MATCH;
                }
                ++cursor;
                --pending;
                break;
            } else if(lead&kValueIsFinal) {
                // No further matching units.
                stop();
                return UDICTTRIE_NO_MATCH;
            } else {
                // Skip the intermediate value and look at the node type behind it.
                cursor=skipNodeValue(cursor, lead);
                lead&=kNodeTypeMask;
            }
        }
    }
}
|
||||
|
||||
// Checks whether all values reachable through a branch are the same.
//
// pos points at the first comparison unit of the (sub-)branch and length is
// its number of units. haveUniqueValue tells whether uniqueValue already
// holds a value that all further values must equal; otherwise the first value
// found is stored into uniqueValue.
//
// Returns NULL as soon as two different values are seen. Otherwise returns
// the position just behind the last comparison unit; the node there is not
// examined by this function.
const UChar *
UCharTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
                                     UBool haveUniqueValue, int32_t &uniqueValue) {
    while(length>kMaxBranchLinearSubNodeLength) {
        ++pos;  // ignore the comparison unit
        if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
            return NULL;
        }
        // The less-than half has been checked successfully, so uniqueValue is
        // set now. Remember that here (haveUniqueValue was passed by value),
        // otherwise the greater-or-equal half would overwrite uniqueValue
        // with its first value instead of comparing against it.
        haveUniqueValue=TRUE;
        length=length-(length>>1);
        pos=skipDelta(pos);
    }
    do {
        ++pos;  // ignore a comparison unit
        // handle its value
        int32_t node=*pos++;
        UBool isFinal=(UBool)(node>>15);
        node&=0x7fff;
        int32_t value=readValue(pos, node);
        pos=skipValue(pos, node);
        if(isFinal) {
            if(haveUniqueValue) {
                if(value!=uniqueValue) {
                    return NULL;
                }
            } else {
                uniqueValue=value;
                haveUniqueValue=TRUE;
            }
        } else {
            // A non-final value is the delta to the sub-node for this unit.
            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
                return NULL;
            }
            haveUniqueValue=TRUE;
        }
    } while(--length>1);
    return pos+1;  // ignore the last comparison unit
}
|
||||
|
||||
// Walks the trie from pos and checks whether every value that can be reached
// is the same. Returns FALSE on the first differing value, TRUE when a final
// value ends the walk. uniqueValue receives the first value found unless
// haveUniqueValue says that it is already set.
UBool
UCharTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    int32_t lead=*pos++;
    for(;;) {
        if(lead<kMinLinearMatch) {
            // Branch node; a zero lead means that the length is in the next unit.
            if(lead==0) {
                lead=*pos++;
            }
            pos=findUniqueValueFromBranch(pos, lead+1, haveUniqueValue, uniqueValue);
            if(pos==NULL) {
                return FALSE;
            }
            haveUniqueValue=TRUE;
            lead=*pos++;
        } else if(lead<kMinValueLead) {
            // Linear-match node: the match units themselves are irrelevant here.
            pos+=lead-kMinLinearMatch+1;
            lead=*pos++;
        } else {
            // Value node.
            const UBool isFinal=(UBool)(lead>>15);
            const int32_t value= isFinal ? readValue(pos, lead&0x7fff) : readNodeValue(pos, lead);
            if(!haveUniqueValue) {
                uniqueValue=value;
                haveUniqueValue=TRUE;
            } else if(value!=uniqueValue) {
                return FALSE;
            }
            if(isFinal) {
                return TRUE;
            }
            // Continue with the node behind the intermediate value.
            pos=skipNodeValue(pos, lead);
            lead&=kNodeTypeMask;
        }
    }
}
|
||||
|
||||
// Appends to out each unit that can continue a matching string from the
// current state, and returns how many units there are.
int32_t
UCharTrie::getNextUChars(Appendable &out) const {
    const UChar *here=pos_;
    if(here==NULL) {
        return 0;
    }
    if(remainingMatchLength_>=0) {
        // Next unit of a pending linear-match node.
        out.append(*here);
        return 1;
    }
    int32_t lead=*here++;
    if(lead>=kMinValueLead) {
        if(lead&kValueIsFinal) {
            return 0;
        }
        // Skip the intermediate value and look at the node type behind it.
        here=skipNodeValue(here, lead);
        lead&=kNodeTypeMask;
    }
    if(lead>=kMinLinearMatch) {
        // First unit of the linear-match node.
        out.append(*here);
        return 1;
    }
    // Branch node; a zero lead means that the length is in the next unit.
    if(lead==0) {
        lead=*here++;
    }
    const int32_t count=lead+1;
    getNextBranchUChars(here, count, out);
    return count;
}
|
||||
|
||||
void
|
||||
UCharTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
|
||||
while(length>kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison unit
|
||||
getNextBranchUChars(jumpByDelta(pos), length>>1, out);
|
||||
length=length-(length>>1);
|
||||
pos=skipDelta(pos);
|
||||
}
|
||||
do {
|
||||
out.append(*pos++);
|
||||
pos=skipValue(pos);
|
||||
} while(--length>1);
|
||||
out.append(*pos);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
433
icu4c/source/tools/toolutil/uchartrie.h
Normal file
433
icu4c/source/tools/toolutil/uchartrie.h
Normal file
|
@ -0,0 +1,433 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartrie.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov14
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UCHARTRIE_H__
|
||||
#define __UCHARTRIE_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Dictionary trie for mapping Unicode strings (or 16-bit-unit sequences)
|
||||
* to integer values.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "uassert.h"
|
||||
#include "udicttrie.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UCharTrieBuilder;
|
||||
class UCharTrieIterator;
|
||||
|
||||
/**
|
||||
* Base class for objects to which Unicode characters and strings can be appended.
|
||||
* Combines elements of Java Appendable and ICU4C ByteSink.
|
||||
* TODO: Should live in separate files, could be public API.
|
||||
*/
|
||||
class U_TOOLUTIL_API Appendable : public UObject {
|
||||
public:
|
||||
/**
|
||||
* Appends a 16-bit code unit.
* This is the only function that subclasses must implement.
|
||||
* @param c code unit
|
||||
* @return *this
|
||||
*/
|
||||
virtual Appendable &append(UChar c) = 0;
|
||||
/**
|
||||
* Appends a code point; has a default implementation.
* The default implementation appends c as one code unit if c<=0xffff,
* otherwise it appends the lead and trail surrogates via append(UChar).
|
||||
* @param c code point
|
||||
* @return *this
|
||||
*/
|
||||
virtual Appendable &appendCodePoint(UChar32 c);
|
||||
/**
|
||||
* Appends a string; has a default implementation.
* The default implementation calls append(UChar) for each code unit
* and does nothing if s is NULL or length is 0.
|
||||
* @param s string
|
||||
* @param length string length, or -1 if NUL-terminated
|
||||
* @return *this
|
||||
*/
|
||||
virtual Appendable &append(const UChar *s, int32_t length);
|
||||
|
||||
// TODO: getAppendBuffer(), see ByteSink
|
||||
// TODO: flush() (?) see ByteSink
|
||||
|
||||
private:
|
||||
// No ICU "poor man's RTTI" for this class nor its subclasses.
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
};
|
||||
|
||||
/**
|
||||
* Light-weight, non-const reader class for a UCharTrie.
|
||||
* Traverses a UChar-serialized data structure with minimal state,
|
||||
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
|
||||
*/
|
||||
class U_TOOLUTIL_API UCharTrie : public UMemory {
|
||||
public:
|
||||
UCharTrie(const UChar *trieUChars)
|
||||
: uchars_(trieUChars),
|
||||
pos_(uchars_), remainingMatchLength_(-1) {}
|
||||
|
||||
/**
|
||||
* Resets this trie to its initial state.
|
||||
*/
|
||||
UCharTrie &reset() {
|
||||
pos_=uchars_;
|
||||
remainingMatchLength_=-1;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* UCharTrie state object, for saving a trie's current state
|
||||
* and resetting the trie back to this state later.
|
||||
*/
|
||||
class State : public UMemory {
|
||||
public:
|
||||
State() { uchars=NULL; }
|
||||
private:
|
||||
friend class UCharTrie;
|
||||
|
||||
const UChar *uchars;
|
||||
const UChar *pos;
|
||||
int32_t remainingMatchLength;
|
||||
};
|
||||
|
||||
/**
|
||||
* Saves the state of this trie.
|
||||
* @see resetToState
|
||||
*/
|
||||
const UCharTrie &saveState(State &state) const {
|
||||
state.uchars=uchars_;
|
||||
state.pos=pos_;
|
||||
state.remainingMatchLength=remainingMatchLength_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets this trie to the saved state.
|
||||
* If the state object contains no state, or the state of a different trie,
|
||||
* then this trie remains unchanged.
|
||||
* @see saveState
|
||||
* @see reset
|
||||
*/
|
||||
UCharTrie &resetToState(const State &state) {
|
||||
if(uchars_==state.uchars && uchars_!=NULL) {
|
||||
pos_=state.pos;
|
||||
remainingMatchLength_=state.remainingMatchLength;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the string so far matches, whether it has a value,
|
||||
* and whether another input UChar can continue a matching string.
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
UDictTrieResult current() const;
|
||||
|
||||
/**
|
||||
* Traverses the trie from the initial state for this input UChar.
|
||||
* Equivalent to reset().next(uchar).
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
inline UDictTrieResult first(int32_t uchar) {
|
||||
remainingMatchLength_=-1;
|
||||
return nextImpl(uchars_, uchar);
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverses the trie from the initial state for the
|
||||
* one or two UTF-16 code units for this input code point.
|
||||
* Equivalent to reset().nextForCodePoint(cp).
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
inline UDictTrieResult firstForCodePoint(UChar32 cp) {
|
||||
return cp<=0xffff ?
|
||||
first(cp) :
|
||||
(first(U16_LEAD(cp))!=UDICTTRIE_NO_MATCH ?
|
||||
next(U16_TRAIL(cp)) :
|
||||
UDICTTRIE_NO_MATCH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverses the trie from the current state for this input UChar.
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
UDictTrieResult next(int32_t uchar);
|
||||
|
||||
/**
|
||||
* Traverses the trie from the current state for the
|
||||
* one or two UTF-16 code units for this input code point.
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
inline UDictTrieResult nextForCodePoint(UChar32 cp) {
|
||||
return cp<=0xffff ?
|
||||
next(cp) :
|
||||
(next(U16_LEAD(cp))!=UDICTTRIE_NO_MATCH ?
|
||||
next(U16_TRAIL(cp)) :
|
||||
UDICTTRIE_NO_MATCH);
|
||||
}
|
||||
|
||||
/**
|
||||
* Traverses the trie from the current state for this string.
|
||||
* Equivalent to
|
||||
* \code
|
||||
* Result result=current();
|
||||
* for(each c in s)
|
||||
* if((result=next(c))==UDICTTRIE_NO_MATCH) return UDICTTRIE_NO_MATCH;
|
||||
* return result;
|
||||
* \endcode
|
||||
* @return The match/value Result.
|
||||
*/
|
||||
UDictTrieResult next(const UChar *s, int32_t length);
|
||||
|
||||
/**
|
||||
* Returns a matching string's value if called immediately after
|
||||
* current()/first()/next() returned UDICTTRIE_HAS_VALUE or UDICTTRIE_HAS_FINAL_VALUE.
|
||||
* getValue() can be called multiple times.
|
||||
*
|
||||
* Do not call getValue() after UDICTTRIE_NO_MATCH or UDICTTRIE_NO_VALUE!
|
||||
*/
|
||||
inline int32_t getValue() const {
|
||||
const UChar *pos=pos_;
|
||||
int32_t leadUnit=*pos++;
|
||||
U_ASSERT(leadUnit>=kMinValueLead);
|
||||
return leadUnit&kValueIsFinal ?
|
||||
readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether all strings reachable from the current state
|
||||
* map to the same value.
|
||||
* @param uniqueValue Receives the unique value, if this function returns TRUE.
|
||||
* (output-only)
|
||||
* @return TRUE if all strings reachable from the current state
|
||||
* map to the same value.
|
||||
*/
|
||||
inline UBool hasUniqueValue(int32_t &uniqueValue) const {
|
||||
const UChar *pos=pos_;
|
||||
// Skip the rest of a pending linear-match node.
|
||||
return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds each UChar which continues the string from the current state.
|
||||
* That is, each UChar c for which it would be next(c)!=UDICTTRIE_NO_MATCH now.
|
||||
* @param out Each next UChar is appended to this object.
|
||||
* (Only uses the out.append(c) method.)
|
||||
* @return the number of UChars which continue the string from here
|
||||
*/
|
||||
int32_t getNextUChars(Appendable &out) const;
|
||||
|
||||
private:
|
||||
friend class UCharTrieBuilder;
|
||||
friend class UCharTrieIterator;
|
||||
|
||||
inline void stop() {
|
||||
pos_=NULL;
|
||||
}
|
||||
|
||||
// Reads a compact 32-bit integer.
|
||||
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
|
||||
static inline int32_t readValue(const UChar *pos, int32_t leadUnit) {
|
||||
int32_t value;
|
||||
if(leadUnit<kMinTwoUnitValueLead) {
|
||||
value=leadUnit;
|
||||
} else if(leadUnit<kThreeUnitValueLead) {
|
||||
value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
|
||||
} else {
|
||||
value=(pos[0]<<16)|pos[1];
|
||||
}
|
||||
return value;
|
||||
}
|
||||
static inline const UChar *skipValue(const UChar *pos, int32_t leadUnit) {
|
||||
if(leadUnit>=kMinTwoUnitValueLead) {
|
||||
if(leadUnit<kThreeUnitValueLead) {
|
||||
++pos;
|
||||
} else {
|
||||
pos+=2;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
static inline const UChar *skipValue(const UChar *pos) {
|
||||
int32_t leadUnit=*pos++;
|
||||
return skipValue(pos, leadUnit&0x7fff);
|
||||
}
|
||||
|
||||
static inline int32_t readNodeValue(const UChar *pos, int32_t leadUnit) {
|
||||
U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
|
||||
int32_t value;
|
||||
if(leadUnit<kMinTwoUnitNodeValueLead) {
|
||||
value=(leadUnit>>6)-1;
|
||||
} else if(leadUnit<kThreeUnitNodeValueLead) {
|
||||
value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
|
||||
} else {
|
||||
value=(pos[0]<<16)|pos[1];
|
||||
}
|
||||
return value;
|
||||
}
|
||||
static inline const UChar *skipNodeValue(const UChar *pos, int32_t leadUnit) {
|
||||
U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
|
||||
if(leadUnit>=kMinTwoUnitNodeValueLead) {
|
||||
if(leadUnit<kThreeUnitNodeValueLead) {
|
||||
++pos;
|
||||
} else {
|
||||
pos+=2;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline const UChar *jumpByDelta(const UChar *pos) {
|
||||
int32_t delta=*pos++;
|
||||
if(delta>=kMinTwoUnitDeltaLead) {
|
||||
if(delta==kThreeUnitDeltaLead) {
|
||||
delta=(pos[0]<<16)|pos[1];
|
||||
pos+=2;
|
||||
} else {
|
||||
delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
|
||||
}
|
||||
}
|
||||
return pos+delta;
|
||||
}
|
||||
|
||||
static const UChar *skipDelta(const UChar *pos) {
|
||||
int32_t delta=*pos++;
|
||||
if(delta>=kMinTwoUnitDeltaLead) {
|
||||
if(delta==kThreeUnitDeltaLead) {
|
||||
pos+=2;
|
||||
} else {
|
||||
++pos;
|
||||
}
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
static inline UDictTrieResult valueResult(int32_t node) {
|
||||
return (UDictTrieResult)(UDICTTRIE_HAS_VALUE-(node>>15));
|
||||
}
|
||||
|
||||
// Handles a branch node for both next(uchar) and next(string).
|
||||
UDictTrieResult branchNext(const UChar *pos, int32_t length, int32_t uchar);
|
||||
|
||||
// Requires remainingLength_<0.
|
||||
UDictTrieResult nextImpl(const UChar *pos, int32_t uchar);
|
||||
|
||||
// Helper functions for hasUniqueValue().
|
||||
// Recursively finds a unique value (or whether there is not a unique one)
|
||||
// from a branch.
|
||||
static const UChar *findUniqueValueFromBranch(const UChar *pos, int32_t length,
|
||||
UBool haveUniqueValue, int32_t &uniqueValue);
|
||||
// Recursively finds a unique value (or whether there is not a unique one)
|
||||
// starting from a position on a node lead unit.
|
||||
static UBool findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue);
|
||||
|
||||
// Helper functions for getNextUChars().
|
||||
// getNextUChars() when pos is on a branch node.
|
||||
static void getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out);
|
||||
|
||||
// UCharTrie data structure
|
||||
//
|
||||
// The trie consists of a series of UChar-serialized nodes for incremental
|
||||
// Unicode string/UChar sequence matching. (UChar=16-bit unsigned integer)
|
||||
// The root node is at the beginning of the trie data.
|
||||
//
|
||||
// Types of nodes are distinguished by their node lead unit ranges.
|
||||
// After each node, except a final-value node, another node follows to
|
||||
// encode match values or continue matching further units.
|
||||
//
|
||||
// Node types:
|
||||
// - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
|
||||
// The value is for the string/UChar sequence so far.
|
||||
// - Match node, optionally with an intermediate value in a different compact format.
|
||||
// The value, if present, is for the string/UChar sequence so far.
|
||||
//
|
||||
// Aside from the value, which uses the node lead unit's high bits:
|
||||
//
|
||||
// - Linear-match node: Matches a number of units.
|
||||
// - Branch node: Branches to other nodes according to the current input unit.
|
||||
// The node unit is the length of the branch (number of units to select from)
|
||||
// minus 1. It is followed by a sub-node:
|
||||
// - If the length is at most kMaxBranchLinearSubNodeLength, then
|
||||
// there are length-1 (key, value) pairs and then one more comparison unit.
|
||||
// If one of the key units matches, then the value is either a final value for
|
||||
// the string so far, or a "jump" delta to the next node.
|
||||
// If the last unit matches, then matching continues with the next node.
|
||||
// (Values have the same encoding as final-value nodes.)
|
||||
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
|
||||
// there is one unit and one "jump" delta.
|
||||
// If the input unit is less than the sub-node unit, then "jump" by delta to
|
||||
// the next sub-node which will have a length of length/2.
|
||||
// (The delta has its own compact encoding.)
|
||||
// Otherwise, skip the "jump" delta to the next sub-node
|
||||
// which will have a length of length-length/2.
|
||||
|
||||
// Match-node lead unit values, after masking off intermediate-value bits:
|
||||
|
||||
// 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
|
||||
// the length is one more than the next unit.
|
||||
|
||||
// For a branch sub-node with at most this many entries, we drop down
|
||||
// to a linear search.
|
||||
static const int32_t kMaxBranchLinearSubNodeLength=5;
|
||||
|
||||
// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
|
||||
static const int32_t kMinLinearMatch=0x30;
|
||||
static const int32_t kMaxLinearMatchLength=0x10;
|
||||
|
||||
// Match-node lead unit bits 14..6 for the optional intermediate value.
|
||||
// If these bits are 0, then there is no intermediate value.
|
||||
// Otherwise, see the *NodeValue* constants below.
|
||||
static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
|
||||
static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
|
||||
|
||||
// A final-value node has bit 15 set.
|
||||
static const int32_t kValueIsFinal=0x8000;
|
||||
|
||||
// Compact value: After testing and masking off bit 15, use the following thresholds.
|
||||
static const int32_t kMaxOneUnitValue=0x3fff;
|
||||
|
||||
static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
|
||||
static const int32_t kThreeUnitValueLead=0x7fff;
|
||||
|
||||
static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
|
||||
|
||||
// Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
|
||||
static const int32_t kMaxOneUnitNodeValue=0xff;
|
||||
static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
|
||||
static const int32_t kThreeUnitNodeValueLead=0x7fc0;
|
||||
|
||||
static const int32_t kMaxTwoUnitNodeValue=
|
||||
((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
|
||||
|
||||
// Compact delta integers.
|
||||
static const int32_t kMaxOneUnitDelta=0xfbff;
|
||||
static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
|
||||
static const int32_t kThreeUnitDeltaLead=0xffff;
|
||||
|
||||
static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
|
||||
|
||||
// Fixed value referencing the UCharTrie words.
|
||||
const UChar *uchars_;
|
||||
|
||||
// Iterator variables.
|
||||
|
||||
// Pointer to next trie unit to read. NULL if no more matches.
|
||||
const UChar *pos_;
|
||||
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
|
||||
int32_t remainingMatchLength_;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __UCHARTRIE_H__
|
696
icu4c/source/tools/toolutil/uchartriebuilder.cpp
Normal file
696
icu4c/source/tools/toolutil/uchartriebuilder.cpp
Normal file
|
@ -0,0 +1,696 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartriebuilder.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov14
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Builder class for UCharTrie dictionary trie.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uchartrie.h"
|
||||
#include "uchartriebuilder.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/*
|
||||
* Note: This builder implementation stores (string, value) pairs with full copies
|
||||
* of the 16-bit-unit sequences, until the UCharTrie is built.
|
||||
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
|
||||
*/
|
||||
|
||||
class UCharTrieElement : public UMemory {
|
||||
public:
|
||||
// Use compiler's default constructor, initializes nothing.
|
||||
|
||||
void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
|
||||
|
||||
UnicodeString getString(const UnicodeString &strings) const {
|
||||
int32_t length=strings[stringOffset];
|
||||
return strings.tempSubString(stringOffset+1, length);
|
||||
}
|
||||
int32_t getStringLength(const UnicodeString &strings) const {
|
||||
return strings[stringOffset];
|
||||
}
|
||||
|
||||
UChar charAt(int32_t index, const UnicodeString &strings) const {
|
||||
return strings[stringOffset+1+index];
|
||||
}
|
||||
|
||||
int32_t getValue() const { return value; }
|
||||
|
||||
int32_t compareStringTo(const UCharTrieElement &o, const UnicodeString &strings) const;
|
||||
|
||||
private:
|
||||
// The first strings unit contains the string length.
|
||||
// (Compared with a stringLength field here, this saves 2 bytes per string.)
|
||||
int32_t stringOffset;
|
||||
int32_t value;
|
||||
};
|
||||
|
||||
void
|
||||
UCharTrieElement::setTo(const UnicodeString &s, int32_t val,
|
||||
UnicodeString &strings, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
int32_t length=s.length();
|
||||
if(length>0xffff) {
|
||||
// Too long: We store the length in 1 unit.
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
stringOffset=strings.length();
|
||||
strings.append((UChar)length);
|
||||
value=val;
|
||||
strings.append(s);
|
||||
}
|
||||
|
||||
// Compares the strings of this element and of other, both stored in strings,
// and returns the result of UnicodeString::compare().
int32_t
UCharTrieElement::compareStringTo(const UCharTrieElement &other, const UnicodeString &strings) const {
    const UnicodeString thisString=getString(strings);
    const UnicodeString otherString=other.getString(strings);
    return thisString.compare(otherString);
}
|
||||
|
||||
// Releases the serialized trie buffer (allocated with uprv_malloc() in build())
// and the elements array (allocated with new[] in add()).
UCharTrieBuilder::~UCharTrieBuilder() {
    uprv_free(uchars);
    delete[] elements;
}
|
||||
|
||||
// Adds a (string, value) pair to the builder.
// @param s the string; stored as a copy inside the builder's strings buffer
// @param value the value for s
// @param errorCode in/out error code:
//        - nothing happens if it already indicates a failure
//        - U_NO_WRITE_PERMISSION if the trie has already been built
//        - U_MEMORY_ALLOCATION_ERROR if the elements array cannot grow
//          or the strings buffer became bogus
//        - whatever UCharTrieElement::setTo() reports
// @return *this
UCharTrieBuilder &
UCharTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(ucharsLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    ucharsCapacity+=s.length()+1;  // Crude uchars preallocation estimate.
    if(elementsLength==elementsCapacity) {
        // The array is full (or not yet allocated): grow it.
        const int32_t newCapacity= elementsCapacity==0 ? 1024 : 4*elementsCapacity;
        UCharTrieElement *newElements=new UCharTrieElement[newCapacity];
        if(newElements==NULL) {
            // Stop here and keep the old array: continuing would copy into
            // and then dereference a NULL pointer, and would drop the
            // elements collected so far.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        if(elementsLength>0) {
            uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharTrieElement));
        }
        delete[] elements;
        elements=newElements;
        elementsCapacity=newCapacity;
    }
    elements[elementsLength++].setTo(s, value, strings, errorCode);
    if(U_SUCCESS(errorCode) && strings.isBogus()) {
        // Appending to the shared strings buffer failed.
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
// Comparison callback for uprv_sortArray().
// context points to the UnicodeString holding all element strings;
// left and right point to the UCharTrieElement objects to compare.
static int32_t U_CALLCONV
compareElementStrings(const void *context, const void *left, const void *right) {
    const UnicodeString &strings=*static_cast<const UnicodeString *>(context);
    const UCharTrieElement &lhs=*static_cast<const UCharTrieElement *>(left);
    const UCharTrieElement &rhs=*static_cast<const UCharTrieElement *>(right);
    return lhs.compareStringTo(rhs, strings);
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
// Builds the serialized trie from the elements added so far and sets result
// to a read-only alias of the serialized units.
// @param buildOption UDICTTRIE_BUILD_FAST writes the nodes directly;
//        any other value goes through the compact builder (makeNode() etc.)
// @param result receives the alias to the builder-owned units
// @param errorCode in/out error code:
//        - U_INDEX_OUTOFBOUNDS_ERROR if no elements have been added
//        - U_MEMORY_ALLOCATION_ERROR if the strings buffer is bogus or
//          the units buffer cannot be allocated
//        - U_ILLEGAL_ARGUMENT_ERROR if the same string was added twice
// @return result
UnicodeString &
UCharTrieBuilder::build(UDictTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return result;
    }
    if(ucharsLength>0) {
        // Already built.
        // The units occupy the last ucharsLength units of the buffer.
        result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
        return result;
    }
    if(elementsLength==0) {
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return result;
    }
    if(strings.isBogus()) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return result;
    }
    uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharTrieElement),
                   compareElementStrings, &strings,
                   FALSE,  // need not be a stable sort
                   &errorCode);
    if(U_FAILURE(errorCode)) {
        return result;
    }
    // Duplicate strings are not allowed.
    // After sorting, duplicates would be adjacent.
    for(int32_t i=1; i<elementsLength; ++i) {
        if(elements[i-1].getString(strings)==elements[i].getString(strings)) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return result;
        }
    }
    // Create and UChar-serialize the trie for the elements.
    if(ucharsCapacity<1024) {
        ucharsCapacity=1024;
    }
    uchars=reinterpret_cast<UChar *>(uprv_malloc(ucharsCapacity*2));
    if(uchars==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return result;
    }
    if(buildOption==UDICTTRIE_BUILD_FAST) {
        writeNode(0, elementsLength, 0);
    } else /* UDICTTRIE_BUILD_SMALL */ {
        createCompactBuilder(2*elementsLength, errorCode);
        Node *root=makeNode(0, elementsLength, 0, errorCode);
        if(U_SUCCESS(errorCode)) {
            root->markRightEdgesFirst(-1);
            root->write(*this);
        }
        deleteCompactBuilder();
    }
    // uchars is re-checked after writing; NULL is reported as an allocation failure.
    if(uchars==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    } else {
        result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    }
    return result;
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
void
|
||||
UCharTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
int32_t type;
|
||||
if(unitIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
writeValueAndFinal(value, TRUE); // final-value node
|
||||
return;
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
const UCharTrieElement &minElement=elements[start];
|
||||
const UCharTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minUnit=minElement.charAt(unitIndex, strings);
|
||||
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastUnitIndex=unitIndex;
|
||||
while(++lastUnitIndex<minStringLength &&
|
||||
minElement.charAt(lastUnitIndex, strings)==
|
||||
maxElement.charAt(lastUnitIndex, strings)) {}
|
||||
writeNode(start, limit, lastUnitIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const UChar *s=minElement.getString(strings).getBuffer();
|
||||
int32_t length=lastUnitIndex-unitIndex;
|
||||
while(length>UCharTrie::kMaxLinearMatchLength) {
|
||||
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
|
||||
length-=UCharTrie::kMaxLinearMatchLength;
|
||||
write(s+lastUnitIndex, UCharTrie::kMaxLinearMatchLength);
|
||||
write(UCharTrie::kMinLinearMatch+UCharTrie::kMaxLinearMatchLength-1);
|
||||
}
|
||||
write(s+unitIndex, length);
|
||||
type=UCharTrie::kMinLinearMatch+length-1;
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different units at unitIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
writeBranchSubNode(start, limit, unitIndex, length);
|
||||
if(--length<UCharTrie::kMinLinearMatch) {
|
||||
type=length;
|
||||
} else {
|
||||
write(length);
|
||||
type=0;
|
||||
}
|
||||
}
|
||||
writeValueAndType(hasValue, value, type);
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
void
|
||||
UCharTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
|
||||
UChar middleUnits[16];
|
||||
int32_t lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
UChar unit;
|
||||
do {
|
||||
unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Encode the less-than branch first.
|
||||
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
|
||||
writeBranchSubNode(start, i, unitIndex, length/2);
|
||||
lessThan[ltLength]=ucharsLength;
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int32_t starts[UCharTrie::kMaxBranchLinearSubNodeLength];
|
||||
UBool final[UCharTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
int32_t unitNumber=0;
|
||||
do {
|
||||
int32_t i=starts[unitNumber]=start;
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
final[unitNumber]= start==i-1 && unitIndex+1==elements[start].getStringLength(strings);
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
starts[unitNumber]=start;
|
||||
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int32_t jumpTargets[UCharTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
do {
|
||||
--unitNumber;
|
||||
if(!final[unitNumber]) {
|
||||
writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
|
||||
jumpTargets[unitNumber]=ucharsLength;
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
writeNode(start, limit, unitIndex+1);
|
||||
write(elements[start].charAt(unitIndex, strings));
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
while(--unitNumber>=0) {
|
||||
start=starts[unitNumber];
|
||||
int32_t value;
|
||||
if(final[unitNumber]) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
value=elements[start].getValue();
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
value=ucharsLength-jumpTargets[unitNumber];
|
||||
}
|
||||
writeValueAndFinal(value, final[unitNumber]);
|
||||
write(elements[start].charAt(unitIndex, strings));
|
||||
}
|
||||
// Write the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
writeDelta(ucharsLength-lessThan[ltLength]); // less-than
|
||||
write(middleUnits[ltLength]);
|
||||
}
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
DictTrieBuilder::Node *
|
||||
UCharTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
if(unitIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
return registerFinalValue(value, errorCode);
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
ValueNode *node;
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
const UCharTrieElement &minElement=elements[start];
|
||||
const UCharTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minUnit=minElement.charAt(unitIndex, strings);
|
||||
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastUnitIndex=unitIndex;
|
||||
while(++lastUnitIndex<minStringLength &&
|
||||
minElement.charAt(lastUnitIndex, strings)==
|
||||
maxElement.charAt(lastUnitIndex, strings)) {}
|
||||
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const UChar *s=minElement.getString(strings).getBuffer();
|
||||
int32_t length=lastUnitIndex-unitIndex;
|
||||
while(length>UCharTrie::kMaxLinearMatchLength) {
|
||||
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
|
||||
length-=UCharTrie::kMaxLinearMatchLength;
|
||||
node=new UCTLinearMatchNode(
|
||||
s+lastUnitIndex,
|
||||
UCharTrie::kMaxLinearMatchLength,
|
||||
nextNode);
|
||||
node=(ValueNode *)registerNode(node, errorCode);
|
||||
nextNode=node;
|
||||
}
|
||||
node=new UCTLinearMatchNode(s+unitIndex, length, nextNode);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different units at unitIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
|
||||
node=new UCTBranchHeadNode(length, subNode);
|
||||
}
|
||||
if(hasValue && node!=NULL) {
|
||||
node->setValue(value);
|
||||
}
|
||||
return registerNode(node, errorCode);
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
DictTrieBuilder::Node *
|
||||
UCharTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
|
||||
int32_t length, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UChar middleUnits[16];
|
||||
Node *lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
UChar unit;
|
||||
do {
|
||||
unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Create the less-than branch.
|
||||
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
|
||||
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UCTListBranchNode *listNode=new UCTListBranchNode();
|
||||
if(listNode==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int32_t unitNumber=0;
|
||||
do {
|
||||
int32_t i=start;
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
if(start==i-1 && unitIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add(unit, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
|
||||
}
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
UChar unit=elements[start].charAt(unitIndex, strings);
|
||||
if(start==limit-1 && unitIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add(unit, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
|
||||
}
|
||||
Node *node=registerNode(listNode, errorCode);
|
||||
// Create the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
node=registerNode(
|
||||
new UCTSplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTFinalValueNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
offset=b.writeValueAndFinal(value, TRUE);
|
||||
}
|
||||
|
||||
// Creates a linear-match node for len units starting at units, followed by nextNode.
// Only the units pointer is stored, not a copy of the units.
UCharTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode), s(units) {
    // Mix the hash of the matched units into the hash set up by the base class.
    const int32_t unitsHash=uhash_hashUCharsN(units, len);
    hash=hash*37+unitsHash;
}
|
||||
|
||||
// Two linear-match nodes are equal if they are the same object, or if the
// base-class comparison succeeds and both match the same length units.
UBool
UCharTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
    return this==&other ||
        (LinearMatchNode::operator==(other) &&
         0==u_memcmp(s, ((const UCTLinearMatchNode &)other).s, length));
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTLinearMatchNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
b.write(s, length);
|
||||
offset=b.writeValueAndType(hasValue, value, UCharTrie::kMinLinearMatch+length-1);
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTListBranchNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int32_t unitNumber=length-1;
|
||||
Node *rightEdge=equal[unitNumber];
|
||||
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
|
||||
do {
|
||||
--unitNumber;
|
||||
if(equal[unitNumber]!=NULL) {
|
||||
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
if(rightEdge==NULL) {
|
||||
b.writeValueAndFinal(values[unitNumber], TRUE);
|
||||
} else {
|
||||
rightEdge->write(builder);
|
||||
}
|
||||
b.write(units[unitNumber]);
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
while(--unitNumber>=0) {
|
||||
int32_t value;
|
||||
UBool isFinal;
|
||||
if(equal[unitNumber]==NULL) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
value=values[unitNumber];
|
||||
isFinal=TRUE;
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
U_ASSERT(equal[unitNumber]->getOffset()>0);
|
||||
value=b.ucharsLength-equal[unitNumber]->getOffset();
|
||||
isFinal=FALSE;
|
||||
}
|
||||
b.writeValueAndFinal(value, isFinal);
|
||||
offset=b.write(units[unitNumber]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTSplitBranchNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
// Encode the less-than branch first.
|
||||
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
|
||||
// Encode the greater-or-equal branch last because we do not jump for it at all.
|
||||
greaterOrEqual->write(builder);
|
||||
// Write this node.
|
||||
U_ASSERT(lessThan->getOffset()>0);
|
||||
b.writeDelta(b.ucharsLength-lessThan->getOffset()); // less-than
|
||||
offset=b.write(unit);
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTBranchHeadNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
if(length<=UCharTrie::kMinLinearMatch) {
|
||||
offset=b.writeValueAndType(hasValue, value, length-1);
|
||||
} else {
|
||||
b.write(length-1);
|
||||
offset=b.writeValueAndType(hasValue, value, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Makes sure that the uchars buffer can hold at least length units.
//
// The buffer is filled from the back, so when it grows, the ucharsLength
// units already written are copied to the end of the new buffer.
//
// @param length The required total number of units.
// @return TRUE if the buffer is large enough afterwards.
//         FALSE if this or an earlier allocation failed, or if the required
//         capacity cannot be represented; in that case the buffer has been
//         released and uchars==NULL, ucharsCapacity==0.
UBool
UCharTrieBuilder::ensureCapacity(int32_t length) {
    if(uchars==NULL) {
        return FALSE;  // previous memory allocation had failed
    }
    if(length<=ucharsCapacity) {
        return TRUE;
    }
    // Largest capacity that can still be doubled without int32_t overflow.
    const int32_t kMaxDoublableCapacity=0x3fffffff;
    // Start from at least 1 so that doubling makes progress even if the
    // current capacity is 0.
    int32_t newCapacity= ucharsCapacity>0 ? ucharsCapacity : 1;
    do {
        if(newCapacity>kMaxDoublableCapacity) {
            newCapacity=0;  // cannot grow enough
            break;
        }
        newCapacity*=2;
    } while(newCapacity<=length);
    UChar *newUChars=NULL;
    if(newCapacity>0) {
        // Compute the byte size in size_t rather than in int32_t.
        newUChars=reinterpret_cast<UChar *>(
            uprv_malloc((size_t)newCapacity*sizeof(UChar)));
    }
    if(newUChars==NULL) {
        // Unable to allocate memory: release the old buffer and keep the
        // pointer and the capacity consistent with each other.
        uprv_free(uchars);
        uchars=NULL;
        ucharsCapacity=0;
        return FALSE;
    }
    u_memcpy(newUChars+(newCapacity-ucharsLength),
             uchars+(ucharsCapacity-ucharsLength), ucharsLength);
    uprv_free(uchars);
    uchars=newUChars;
    ucharsCapacity=newCapacity;
    return TRUE;
}
|
||||
|
||||
// Writes one unit in front of what has been written so far
// (the buffer is filled from the back).
// Returns the resulting ucharsLength; if the buffer could not be grown,
// nothing is written and the unchanged ucharsLength is returned.
int32_t
UCharTrieBuilder::write(int32_t unit) {
    const int32_t lengthAfter=ucharsLength+1;
    if(!ensureCapacity(lengthAfter)) {
        return ucharsLength;
    }
    ucharsLength=lengthAfter;
    uchars[ucharsCapacity-lengthAfter]=(UChar)unit;
    return lengthAfter;
}
|
||||
|
||||
// Writes length units from s, in their given order, in front of what has
// been written so far (the buffer is filled from the back).
// Returns the resulting ucharsLength; if the buffer could not be grown,
// nothing is written and the unchanged ucharsLength is returned.
int32_t
UCharTrieBuilder::write(const UChar *s, int32_t length) {
    const int32_t lengthAfter=ucharsLength+length;
    if(!ensureCapacity(lengthAfter)) {
        return ucharsLength;
    }
    ucharsLength=lengthAfter;
    u_memcpy(uchars+(ucharsCapacity-lengthAfter), s, length);
    return lengthAfter;
}
|
||||
|
||||
int32_t
|
||||
UCharTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
|
||||
UChar intUnits[3];
|
||||
int32_t length;
|
||||
if(i<0 || i>UCharTrie::kMaxTwoUnitValue) {
|
||||
intUnits[0]=(UChar)(UCharTrie::kThreeUnitValueLead);
|
||||
intUnits[1]=(UChar)(i>>16);
|
||||
intUnits[2]=(UChar)i;
|
||||
length=3;
|
||||
} else if(i<=UCharTrie::kMaxOneUnitValue) {
|
||||
intUnits[0]=(UChar)(i);
|
||||
length=1;
|
||||
} else {
|
||||
intUnits[0]=(UChar)(UCharTrie::kMinTwoUnitValueLead+(i>>16));
|
||||
intUnits[1]=(UChar)i;
|
||||
length=2;
|
||||
}
|
||||
intUnits[0]=(UChar)(intUnits[0]|(final<<15));
|
||||
return write(intUnits, length);
|
||||
}
|
||||
|
||||
int32_t
|
||||
UCharTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
|
||||
if(!hasValue) {
|
||||
return write(node);
|
||||
}
|
||||
UChar intUnits[3];
|
||||
int32_t length;
|
||||
if(value<0 || value>UCharTrie::kMaxTwoUnitNodeValue) {
|
||||
intUnits[0]=(UChar)(UCharTrie::kThreeUnitNodeValueLead);
|
||||
intUnits[1]=(UChar)(value>>16);
|
||||
intUnits[2]=(UChar)value;
|
||||
length=3;
|
||||
} else if(value<=UCharTrie::kMaxOneUnitNodeValue) {
|
||||
intUnits[0]=(UChar)((value+1)<<6);
|
||||
length=1;
|
||||
} else {
|
||||
intUnits[0]=(UChar)(UCharTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
|
||||
intUnits[1]=(UChar)value;
|
||||
length=2;
|
||||
}
|
||||
intUnits[0]|=(UChar)node;
|
||||
return write(intUnits, length);
|
||||
}
|
||||
|
||||
// Writes a non-negative jump delta in one, two or three units:
// - up to kMaxOneUnitDelta: the delta itself
// - up to kMaxTwoUnitDelta: lead unit with the upper bits, then the lower 16 bits
// - otherwise: kThreeUnitDeltaLead, then the upper and the lower 16 bits
// Returns the return value of write(), that is, the ucharsLength afterwards.
int32_t
UCharTrieBuilder::writeDelta(int32_t i) {
    U_ASSERT(i>=0);
    UChar deltaUnits[3];
    int32_t count=0;
    if(i>UCharTrie::kMaxOneUnitDelta) {
        if(i<=UCharTrie::kMaxTwoUnitDelta) {
            deltaUnits[count++]=(UChar)(UCharTrie::kMinTwoUnitDeltaLead+(i>>16));
        } else {
            deltaUnits[count++]=(UChar)(UCharTrie::kThreeUnitDeltaLead);
            deltaUnits[count++]=(UChar)(i>>16);
        }
    }
    // The last unit always holds the lower 16 bits.
    deltaUnits[count++]=(UChar)i;
    return write(deltaUnits, count);
}
|
||||
|
||||
U_NAMESPACE_END
|
112
icu4c/source/tools/toolutil/uchartriebuilder.h
Normal file
112
icu4c/source/tools/toolutil/uchartriebuilder.h
Normal file
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartriebuilder.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov14
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Builder class for UCharTrie dictionary trie.
|
||||
*/
|
||||
|
||||
#ifndef __UCHARTRIEBUILDER_H__
|
||||
#define __UCHARTRIEBUILDER_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "dicttriebuilder.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UCharTrieElement;
|
||||
|
||||
/**
 * Builder class for UCharTrie dictionary tries.
 * Holds the added elements and the UChar serialization of the trie.
 */
class U_TOOLUTIL_API UCharTrieBuilder : public DictTrieBuilder {
public:
    /** Constructs a builder without elements and without a serialization buffer. */
    UCharTrieBuilder()
            : elements(NULL),
              elementsCapacity(0),
              elementsLength(0),
              uchars(NULL),
              ucharsCapacity(0),
              ucharsLength(0) {}
    ~UCharTrieBuilder();

    // NOTE(review): presumably registers the (s, value) pair for a later build();
    // the implementation is not part of this excerpt.
    UCharTrieBuilder &add(const UnicodeString &s, int32_t value, UErrorCode &errorCode);

    // NOTE(review): presumably serializes the added pairs into result and
    // returns result; the implementation is not part of this excerpt.
    UnicodeString &build(UDictTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode);

    /**
     * Empties the strings and resets the element count and the serialized length.
     * The allocated buffers and their capacities are left untouched.
     * @return *this
     */
    UCharTrieBuilder &clear() {
        ucharsLength=0;
        elementsLength=0;
        strings.remove();
        return *this;
    }

private:
    void writeNode(int32_t start, int32_t limit, int32_t unitIndex);
    void writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);

    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
                            int32_t length, UErrorCode &errorCode);

    // Low-level output. Each write function returns the ucharsLength afterwards.
    UBool ensureCapacity(int32_t length);
    int32_t write(int32_t unit);
    int32_t write(const UChar *s, int32_t length);
    int32_t writeValueAndFinal(int32_t i, UBool final);
    int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
    int32_t writeDelta(int32_t i);

    // Compacting builder: node classes that know how to write themselves
    // into a UCharTrieBuilder.
    class UCTFinalValueNode : public FinalValueNode {
    public:
        UCTFinalValueNode(int32_t v) : FinalValueNode(v) {}
        virtual void write(DictTrieBuilder &builder);
    };

    class UCTLinearMatchNode : public LinearMatchNode {
    public:
        UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode);
        virtual UBool operator==(const Node &other) const;
        virtual void write(DictTrieBuilder &builder);
    private:
        const UChar *s;  // The units to match; not owned by the node.
    };

    class UCTListBranchNode : public ListBranchNode {
    public:
        UCTListBranchNode() : ListBranchNode() {}
        virtual void write(DictTrieBuilder &builder);
    };

    class UCTSplitBranchNode : public SplitBranchNode {
    public:
        UCTSplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
                : SplitBranchNode(middleUnit, lessThanNode, greaterOrEqualNode) {}
        virtual void write(DictTrieBuilder &builder);
    };

    class UCTBranchHeadNode : public BranchHeadNode {
    public:
        UCTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
        virtual void write(DictTrieBuilder &builder);
    };

    // Returns a newly allocated final-value node of this builder's node type.
    virtual Node *createFinalValueNode(int32_t value) const { return new UCTFinalValueNode(value); }

    UnicodeString strings;
    UCharTrieElement *elements;
    int32_t elementsCapacity;
    int32_t elementsLength;

    // UChar serialization of the trie.
    // Grows from the back: ucharsLength measures from the end of the buffer!
    UChar *uchars;
    int32_t ucharsCapacity;
    int32_t ucharsLength;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __UCHARTRIEBUILDER_H__
|
181
icu4c/source/tools/toolutil/uchartrieiterator.cpp
Normal file
181
icu4c/source/tools/toolutil/uchartrieiterator.cpp
Normal file
|
@ -0,0 +1,181 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartrieiterator.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov15
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "uchartrie.h"
|
||||
#include "uchartrieiterator.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Starts the iteration at the first unit of trieUChars, with no pending
// linear-match units, an empty string and an empty stack.
UCharTrieIterator::UCharTrieIterator(const UChar *trieUChars,
                                     int32_t maxStringLength,
                                     UErrorCode &errorCode)
        : uchars_(trieUChars),
          pos_(trieUChars),
          initialPos_(trieUChars),
          remainingMatchLength_(-1),
          initialRemainingMatchLength_(-1),
          skipValue_(FALSE),
          maxLength_(maxStringLength),
          value_(0),
          stack_(errorCode) {
}
|
||||
|
||||
// Starts the iteration at the current state of trie.
// If the trie is in the middle of a linear-match node, the remaining match
// units (limited to maxStringLength if that is positive) are appended to the
// string right away.
UCharTrieIterator::UCharTrieIterator(const UCharTrie &trie,
                                     int32_t maxStringLength,
                                     UErrorCode &errorCode)
        : uchars_(trie.uchars_),
          pos_(trie.pos_),
          initialPos_(trie.pos_),
          remainingMatchLength_(trie.remainingMatchLength_),
          initialRemainingMatchLength_(trie.remainingMatchLength_),
          skipValue_(FALSE),
          maxLength_(maxStringLength),
          value_(0),
          stack_(errorCode) {
    if(remainingMatchLength_<0) {
        return;  // No pending linear-match node.
    }
    // remainingMatchLength_ is the actual remaining match length minus 1.
    int32_t pending=remainingMatchLength_+1;
    if(maxLength_>0 && pending>maxLength_) {
        pending=maxLength_;  // This will leave remainingMatchLength_>=0 as a signal.
    }
    str_.append(pos_, pending);
    pos_+=pending;
    remainingMatchLength_-=pending;
}
|
||||
|
||||
// Restores the state from right after construction: the string is cut back
// to the units of the initially pending linear-match node (if any), the
// position and the remaining match length are recomputed from their initial
// values, and the stack is emptied.
UCharTrieIterator &UCharTrieIterator::reset() {
    // Number of initially pending match units that belong to the string.
    int32_t kept=initialRemainingMatchLength_+1;
    if(maxLength_>0 && kept>maxLength_) {
        kept=maxLength_;
    }
    str_.truncate(kept);
    pos_=initialPos_+kept;
    remainingMatchLength_=initialRemainingMatchLength_-kept;
    skipValue_=FALSE;
    stack_.setSize(0);
    return *this;
}
|
||||
|
||||
// Advances to the next (string, value) pair.
// Returns FALSE if errorCode indicates a failure on input or if there are
// no more pairs; otherwise TRUE, with the pair available in str_ and value_.
UBool
UCharTrieIterator::next(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    const UChar *cursor=pos_;
    if(cursor==NULL) {
        if(stack_.isEmpty()) {
            return FALSE;
        }
        // Pop the state off the stack and continue with the next outbound edge of
        // the branch node.
        const int32_t top=stack_.size();
        const int32_t packed=stack_.elementAti(top-1);
        cursor=uchars_+stack_.elementAti(top-2);
        stack_.setSize(top-2);
        // Bits 15..0: string length from before the branch node.
        str_.truncate(packed&0xffff);
        // Bits 31..16: remaining branch length.
        const int32_t branchLength=(int32_t)((uint32_t)packed>>16);
        if(branchLength<=1) {
            // Last edge of the branch: just take its unit.
            str_.append(*cursor++);
        } else {
            cursor=branchNext(cursor, branchLength, errorCode);
            if(cursor==NULL) {
                return TRUE;  // Reached a final value.
            }
        }
    }
    if(remainingMatchLength_>=0) {
        // We only get here if we started in a pending linear-match node
        // with more than maxLength remaining units.
        return truncateAndStop();
    }
    for(;;) {
        int32_t node=*cursor++;
        if(node>=UCharTrie::kMinValueLead) {
            if(!skipValue_) {
                // Deliver value for the string so far.
                const UBool isFinal=(UBool)(node>>15);
                if(isFinal) {
                    value_=UCharTrie::readValue(cursor, node&0x7fff);
                } else {
                    value_=UCharTrie::readNodeValue(cursor, node);
                }
                if(!isFinal && !(maxLength_>0 && str_.length()==maxLength_)) {
                    // We cannot skip the value right here because it shares its
                    // lead unit with a match node which we have to evaluate
                    // next time.
                    // Instead, keep pos_ on the node lead unit itself.
                    pos_=cursor-1;
                    skipValue_=TRUE;
                } else {
                    pos_=NULL;
                }
                return TRUE;
            }
            // This value was delivered by the previous call: step over it
            // and continue with the node type bits.
            cursor=UCharTrie::skipNodeValue(cursor, node);
            node&=UCharTrie::kNodeTypeMask;
            skipValue_=FALSE;
        }
        if(maxLength_>0 && str_.length()==maxLength_) {
            return truncateAndStop();
        }
        if(node>=UCharTrie::kMinLinearMatch) {
            // Linear-match node, append matchLength units to str_.
            const int32_t matchLength=node-UCharTrie::kMinLinearMatch+1;
            if(maxLength_>0 && str_.length()+matchLength>maxLength_) {
                str_.append(cursor, maxLength_-str_.length());
                return truncateAndStop();
            }
            str_.append(cursor, matchLength);
            cursor+=matchLength;
        } else {
            // Branch node; a node value of 0 means that the branch length
            // minus 1 follows in the next unit.
            if(node==0) {
                node=*cursor++;
            }
            cursor=branchNext(cursor, node+1, errorCode);
            if(cursor==NULL) {
                return TRUE;  // Reached a final value.
            }
        }
    }
}
|
||||
|
||||
// Branch node, needs to take the first outbound edge and push state for the rest.
|
||||
const UChar *
|
||||
UCharTrieIterator::branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode) {
|
||||
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
|
||||
++pos; // ignore the comparison unit
|
||||
// Push state for the greater-or-equal edge.
|
||||
stack_.addElement((int32_t)(UCharTrie::skipDelta(pos)-uchars_), errorCode);
|
||||
stack_.addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
|
||||
// Follow the less-than edge.
|
||||
length>>=1;
|
||||
pos=UCharTrie::jumpByDelta(pos);
|
||||
}
|
||||
// List of key-value pairs where values are either final values or jump deltas.
|
||||
// Read the first (key, value) pair.
|
||||
UChar trieUnit=*pos++;
|
||||
int32_t node=*pos++;
|
||||
UBool isFinal=(UBool)(node>>15);
|
||||
int32_t value=UCharTrie::readValue(pos, node&=0x7fff);
|
||||
pos=UCharTrie::skipValue(pos, node);
|
||||
stack_.addElement((int32_t)(pos-uchars_), errorCode);
|
||||
stack_.addElement(((length-1)<<16)|str_.length(), errorCode);
|
||||
str_.append(trieUnit);
|
||||
if(isFinal) {
|
||||
pos_=NULL;
|
||||
value_=value;
|
||||
return NULL;
|
||||
} else {
|
||||
return pos+value;
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
121
icu4c/source/tools/toolutil/uchartrieiterator.h
Normal file
121
icu4c/source/tools/toolutil/uchartrieiterator.h
Normal file
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartrieiterator.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010nov15
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UCHARTRIEITERATOR_H__
|
||||
#define __UCHARTRIEITERATOR_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: UCharTrie iterator for all of its (string, value) pairs.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "uchartrie.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Iterator for all of the (string, value) pairs in a UCharTrie.
|
||||
*/
|
||||
class U_TOOLUTIL_API UCharTrieIterator : public UMemory {
public:
    /**
     * Iterates from the root of a UChar-serialized UCharTrie.
     * @param trieUChars The trie UChars.
     * @param maxStringLength If 0, the iterator returns full strings.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     */
    UCharTrieIterator(const UChar *trieUChars, int32_t maxStringLength, UErrorCode &errorCode);

    /**
     * Iterates from the current state of the specified UCharTrie.
     * @param trie The trie whose state will be copied for iteration.
     * @param maxStringLength If 0, the iterator returns full strings.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     */
    UCharTrieIterator(const UCharTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);

    /**
     * Resets this iterator to its initial state.
     * @return *this
     */
    UCharTrieIterator &reset();

    /**
     * Finds the next (string, value) pair if there is one.
     *
     * If the string is truncated to the maximum length and does not
     * have a real value, then the value is set to -1.
     * In this case, this "not a real value" is indistinguishable from
     * a real value of -1.
     * @param errorCode Standard ICU error code. If it indicates a failure
     *                  on input, then the function returns FALSE immediately.
     * @return TRUE if there is another element.
     */
    UBool next(UErrorCode &errorCode);

    /**
     * @return TRUE if there are more elements.
     */
    UBool hasNext() const { return !(pos_==NULL && stack_.isEmpty()); }

    /**
     * @return the string for the last successful next()
     */
    const UnicodeString &getString() const { return str_; }
    /**
     * @return the value for the last successful next()
     */
    int32_t getValue() const { return value_; }

private:
    // Ends the current path: the string has no real value (-1),
    // and the next call to next() continues from the stack.
    UBool truncateAndStop() {
        value_=-1;  // no real value for str
        pos_=NULL;
        return TRUE;
    }

    const UChar *branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode);

    const UChar *uchars_;
    const UChar *pos_;
    const UChar *initialPos_;
    int32_t remainingMatchLength_;
    int32_t initialRemainingMatchLength_;
    UBool skipValue_;  // Skip intermediate value which was already delivered.

    UnicodeString str_;
    int32_t maxLength_;
    int32_t value_;

    // The stack stores pairs of integers for backtracking to another
    // outbound edge of a branch node.
    // The first integer is an offset from uchars_.
    // The second integer has the str_.length() from before the node in bits 15..0,
    // and the remaining branch length in bits 31..16.
    // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
    // but the code looks more confusing that way.)
    UVector32 stack_;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __UCHARTRIEITERATOR_H__
|
|
@ -216,3 +216,42 @@ usrc_writeUTrie2Struct(FILE *f,
|
|||
fputs(postfix, f);
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
usrc_writeArrayOfMostlyInvChars(FILE *f,
|
||||
const char *prefix,
|
||||
const char *p, int32_t length,
|
||||
const char *postfix) {
|
||||
int32_t i, col;
|
||||
int prev2, prev, c;
|
||||
|
||||
if(prefix!=NULL) {
|
||||
fprintf(f, prefix, (long)length);
|
||||
}
|
||||
prev2=prev=-1;
|
||||
for(i=col=0; i<length; ++i, ++col) {
|
||||
c=(uint8_t)p[i];
|
||||
if(i>0) {
|
||||
/* Break long lines. Try to break at interesting places, to minimize revision diffs. */
|
||||
if(
|
||||
/* Very long line. */
|
||||
col>=32 ||
|
||||
/* Long line, break after terminating NUL. */
|
||||
(col>=24 && prev2>=0x20 && prev==0) ||
|
||||
/* Medium-long line, break before non-NUL, non-character byte. */
|
||||
(col>=16 && (prev==0 || prev>=0x20) && 0<c && c<0x20)
|
||||
) {
|
||||
fputs(",\n", f);
|
||||
col=0;
|
||||
} else {
|
||||
fputc(',', f);
|
||||
}
|
||||
}
|
||||
fprintf(f, c<0x20 ? "%u" : "'%c'", c);
|
||||
prev2=prev;
|
||||
prev=c;
|
||||
}
|
||||
if(postfix!=NULL) {
|
||||
fputs(postfix, f);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,21 +24,21 @@
|
|||
#include "utrie2.h"
|
||||
|
||||
/**
|
||||
* Create a source text file and write a header comment with the ICU copyright.
|
||||
* Creates a source text file and writes a header comment with the ICU copyright.
|
||||
* Writes a C/Java-style comment.
|
||||
*/
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_create(const char *path, const char *filename);
|
||||
|
||||
/**
|
||||
* Create a source text file and write a header comment with the ICU copyright.
|
||||
* Creates a source text file and writes a header comment with the ICU copyright.
|
||||
* Writes the comment with # lines, as used in scripts and text data.
|
||||
*/
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_createTextData(const char *path, const char *filename);
|
||||
|
||||
/**
|
||||
* Write the contents of an array of 8/16/32-bit words.
|
||||
* Writes the contents of an array of 8/16/32-bit words.
|
||||
* The prefix and postfix are optional (can be NULL) and are written first/last.
|
||||
* The prefix may contain a %ld or similar field for the array length.
|
||||
* The {} and declaration etc. need to be included in prefix/postfix or
|
||||
|
@ -73,4 +73,20 @@ usrc_writeUTrie2Struct(FILE *f,
|
|||
const char *indexName, const char *dataName,
|
||||
const char *postfix);
|
||||
|
||||
/**
|
||||
* Writes the contents of an array of mostly invariant characters.
|
||||
* Characters 0..0x1f are printed as numbers,
|
||||
* others as characters with single quotes: '%c'.
|
||||
*
|
||||
* The prefix and postfix are optional (can be NULL) and are written first/last.
|
||||
* The prefix may contain a %ld or similar field for the array length.
|
||||
* The {} and declaration etc. need to be included in prefix/postfix or
|
||||
* printed before and after the array contents.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
usrc_writeArrayOfMostlyInvChars(FILE *f,
|
||||
const char *prefix,
|
||||
const char *p, int32_t length,
|
||||
const char *postfix);
|
||||
|
||||
#endif
|
||||
|
|
Loading…
Add table
Reference in a new issue