ICU-8105 hardcode Unicode property names data (formatVersion 2); includes new dictionary-type tries (ByteTrie & UCharTrie see ticket #8167); merge branches/markus/tries -r 29040:29249

X-SVN-Rev: 29252
This commit is contained in:
Markus Scherer 2010-12-31 18:21:36 +00:00
parent 3e29cb9f1f
commit c04082d93c
53 changed files with 9209 additions and 1067 deletions

View file

@ -85,7 +85,7 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \
uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \
ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o loclikely.o locresdata.o \
bytestream.o stringpiece.o \
bytestream.o stringpiece.o bytetrie.o \
ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \

View file

@ -0,0 +1,431 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetrie.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/bytestream.h"
#include "unicode/uobject.h"
#include "uassert.h"
#include "bytetrie.h"
U_NAMESPACE_BEGIN
// Decodes a compact value.
// leadByte is the value node's lead byte already shifted right by 1;
// pos points at the first byte after the lead byte.
int32_t
ByteTrie::readValue(const uint8_t *pos, int32_t leadByte) {
    if(leadByte>kFourByteValueLead) {
        // Five-byte form: all 32 bits come from the four trailing bytes.
        return (pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
    }
    if(leadByte==kFourByteValueLead) {
        // Four-byte form: 24 bits from the three trailing bytes.
        return (pos[0]<<16)|(pos[1]<<8)|pos[2];
    }
    if(leadByte>=kMinThreeByteValueLead) {
        // Three-byte form: top bits in the lead byte, then two trailing bytes.
        return ((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
    }
    if(leadByte>=kMinTwoByteValueLead) {
        // Two-byte form: top bits in the lead byte, then one trailing byte.
        return ((leadByte-kMinTwoByteValueLead)<<8)|pos[0];
    }
    // One-byte form: the value is stored entirely in the lead byte.
    return leadByte-kMinOneByteValueLead;
}
// Reads the compact jump delta starting at pos and returns the position
// that lies delta bytes past the end of the encoded delta.
const uint8_t *
ByteTrie::jumpByDelta(const uint8_t *pos) {
    const int32_t lead=*pos++;
    int32_t delta;
    if(lead<kMinTwoByteDeltaLead) {
        // Single byte: the lead byte is the delta itself.
        delta=lead;
    } else if(lead<kMinThreeByteDeltaLead) {
        delta=((lead-kMinTwoByteDeltaLead)<<8)|pos[0];
        pos+=1;
    } else if(lead<kFourByteDeltaLead) {
        delta=((lead-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
        pos+=2;
    } else if(lead==kFourByteDeltaLead) {
        delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
        pos+=3;
    } else {
        delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
        pos+=4;
    }
    return pos+delta;
}
// Reports the match/value result for the current state without consuming input.
UDictTrieResult
ByteTrie::current() const {
    if(pos_==NULL) {
        // The trie was stopped by an earlier mismatch.
        return UDICTTRIE_NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        // Still inside a linear-match node: no value can be here.
        return UDICTTRIE_NO_VALUE;
    }
    const int32_t node=*pos_;
    if(node>=kMinValueLead) {
        return valueResult(node);
    }
    return UDICTTRIE_NO_VALUE;
}
// Handles a branch node: selects the edge that matches inByte.
// pos points just past the branch node's lead byte.
// length is the lead byte's value; 0 means that the real (length minus 1)
// is stored in the next byte.
// On a match, pos_ is set to the new position and the result for that
// position is returned. If the final comparison byte does not match either,
// stop() is called and UDICTTRIE_NO_MATCH is returned.
UDictTrieResult
ByteTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
// Branch according to the current byte.
if(length==0) {
length=*pos++;
}
++length;
// The length of the branch is the number of bytes to select from.
// The data structure encodes a binary search.
while(length>kMaxBranchLinearSubNodeLength) {
if(inByte<*pos++) {
// Less-than half: follow the jump delta; that half has length/2 entries.
length>>=1;
pos=jumpByDelta(pos);
} else {
// Greater-or-equal half: it starts right after the delta.
length=length-(length>>1);
pos=skipDelta(pos);
}
}
// Drop down to linear search for the last few bytes.
// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
// and divides length by 2.
do {
if(inByte==*pos++) {
UDictTrieResult result;
int32_t node=*pos;
U_ASSERT(node>=kMinValueLead);
if(node&kValueIsFinal) {
// Leave the final value for getValue() to read.
result=UDICTTRIE_HAS_FINAL_VALUE;
} else {
// Use the non-final value as the jump delta.
++pos;
// The following is readValue() written out inline so that pos can be
// advanced past the value bytes while decoding.
// int32_t delta=readValue(pos, node>>1);
node>>=1;
int32_t delta;
if(node<kMinTwoByteValueLead) {
delta=node-kMinOneByteValueLead;
} else if(node<kMinThreeByteValueLead) {
delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
} else if(node<kFourByteValueLead) {
delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
pos+=2;
} else if(node==kFourByteValueLead) {
delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
pos+=3;
} else {
delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
pos+=4;
}
// end readValue()
// Jump to the target node and report whether it is a value node.
pos+=delta;
node=*pos;
result= node>=kMinValueLead ? valueResult(node) : UDICTTRIE_NO_VALUE;
}
pos_=pos;
return result;
}
// No match on this key: skip its value and try the next entry.
--length;
pos=skipValue(pos);
} while(length>1);
// Last comparison byte: it has no value of its own;
// on a match, matching continues with the node that directly follows it.
if(inByte==*pos++) {
pos_=pos;
int32_t node=*pos;
return node>=kMinValueLead ? valueResult(node) : UDICTTRIE_NO_VALUE;
} else {
stop();
return UDICTTRIE_NO_MATCH;
}
}
// Matches inByte against the node at pos (not inside a pending linear-match node).
// Intermediate values are skipped; a final value means that nothing can follow.
UDictTrieResult
ByteTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
    int32_t node=*pos++;
    // Step over intermediate value nodes until a branch or linear-match node shows up.
    while(node>=kMinValueLead) {
        if(node&kValueIsFinal) {
            // No further matching bytes.
            stop();
            return UDICTTRIE_NO_MATCH;
        }
        pos=skipValue(pos, node);
        // The next node must not also be a value node.
        U_ASSERT(*pos<kMinValueLead);
        node=*pos++;
    }
    if(node<kMinLinearMatch) {
        return branchNext(pos, node, inByte);
    }
    // Linear-match node: compare the first of its bytes.
    const uint8_t firstMatchByte=*pos++;
    if(inByte!=firstMatchByte) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    // Remaining match length minus 1; negative once the node is used up.
    const int32_t remaining=node-kMinLinearMatch-1;
    remainingMatchLength_=remaining;
    pos_=pos;
    if(remaining<0) {
        const int32_t nextNode=*pos;
        if(nextNode>=kMinValueLead) {
            return valueResult(nextNode);
        }
    }
    return UDICTTRIE_NO_VALUE;
}
// Advances the trie by one input byte from the current state.
UDictTrieResult
ByteTrie::next(int32_t inByte) {
    const uint8_t *p=pos_;
    if(p==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    int32_t remaining=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(remaining<0) {
        // Not inside a linear-match node: interpret the node at p.
        return nextImpl(p, inByte);
    }
    // Remaining part of a linear-match node.
    if(inByte!=*p) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    ++p;
    --remaining;
    remainingMatchLength_=remaining;
    pos_=p;
    if(remaining>=0) {
        return UDICTTRIE_NO_VALUE;
    }
    // The linear-match node is finished; a value node may follow.
    const int32_t node=*p;
    return node>=kMinValueLead ? valueResult(node) : UDICTTRIE_NO_VALUE;
}
// Advances the trie by a whole byte sequence from the current state.
// s: the input bytes.
// sLength: number of bytes in s, or negative if s is NUL-terminated.
// Returns the result for the state after the last input byte, or
// UDICTTRIE_NO_MATCH (after calling stop()) as soon as one byte does not match.
// For empty input the result is current().
UDictTrieResult
ByteTrie::next(const char *s, int32_t sLength) {
if(sLength<0 ? *s==0 : sLength==0) {
// Empty input.
return current();
}
const uint8_t *pos=pos_;
if(pos==NULL) {
return UDICTTRIE_NO_MATCH;
}
int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
// Each outer iteration first consumes input while inside a linear-match node
// (the two loops below differ only in how they detect the end of the input),
// then interprets trie nodes for the one input byte left in inByte.
for(;;) {
// Fetch the next input byte, if there is one.
// Continue a linear-match node without rechecking sLength<0.
int32_t inByte;
if(sLength<0) {
// NUL-terminated input.
for(;;) {
if((inByte=*s++)==0) {
// End of input: save the state and report the result for it.
remainingMatchLength_=length;
pos_=pos;
int32_t node;
return (length<0 && (node=*pos)>=kMinValueLead) ?
valueResult(node) : UDICTTRIE_NO_VALUE;
}
if(length<0) {
// Not (or no longer) in a linear-match node: handle inByte below.
remainingMatchLength_=length;
break;
}
if(inByte!=*pos) {
stop();
return UDICTTRIE_NO_MATCH;
}
++pos;
--length;
}
} else {
// Input with an explicit length.
for(;;) {
if(sLength==0) {
// End of input: save the state and report the result for it.
remainingMatchLength_=length;
pos_=pos;
int32_t node;
return (length<0 && (node=*pos)>=kMinValueLead) ?
valueResult(node) : UDICTTRIE_NO_VALUE;
}
inByte=*s++;
--sLength;
if(length<0) {
// Not (or no longer) in a linear-match node: handle inByte below.
remainingMatchLength_=length;
break;
}
if(inByte!=*pos) {
stop();
return UDICTTRIE_NO_MATCH;
}
++pos;
--length;
}
}
// Interpret nodes for inByte until a linear-match node is entered
// (then continue with the outer loop) or the input or the match ends.
for(;;) {
int32_t node=*pos++;
if(node<kMinLinearMatch) {
// Branch node. branchNext() stores the new position in pos_ or calls stop().
UDictTrieResult result=branchNext(pos, node, inByte);
if(result==UDICTTRIE_NO_MATCH) {
return UDICTTRIE_NO_MATCH;
}
// Fetch the next input byte, if there is one.
if(sLength<0) {
if((inByte=*s++)==0) {
return result;
}
} else {
if(sLength==0) {
return result;
}
inByte=*s++;
--sLength;
}
// There is more input, but a final value cannot be followed by anything.
if(result==UDICTTRIE_HAS_FINAL_VALUE) {
// No further matching bytes.
stop();
return UDICTTRIE_NO_MATCH;
}
pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
} else if(node<kMinValueLead) {
// Match length+1 bytes.
length=node-kMinLinearMatch; // Actual match length minus 1.
if(inByte!=*pos) {
stop();
return UDICTTRIE_NO_MATCH;
}
// The first byte matched; the outer loop continues with the rest of this node.
++pos;
--length;
break;
} else if(node&kValueIsFinal) {
// No further matching bytes.
stop();
return UDICTTRIE_NO_MATCH;
} else {
// Skip intermediate value.
pos=skipValue(pos, node);
// The next node must not also be a value node.
U_ASSERT(*pos<kMinValueLead);
}
}
}
}
// Recursively checks whether all values reachable through a branch (sub-)node
// are the same.
// pos: the first comparison byte of the branch (sub-)node.
// length: the number of bytes that this (sub-)node selects from.
// haveUniqueValue: whether uniqueValue already holds a value that every
//     further value must be equal to.
// uniqueValue: in/out; receives the first value found and is compared
//     with every later one.
// Returns NULL as soon as two different values are seen. Otherwise returns the
// position just past the last comparison byte, which is the node with which
// matching continues for that byte; the caller has to check that node as well.
const uint8_t *
ByteTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
                                    UBool haveUniqueValue, int32_t &uniqueValue) {
    while(length>kMaxBranchLinearSubNodeLength) {
        ++pos;  // ignore the comparison byte
        // Less-than half: reached via the jump delta, with length/2 entries.
        const uint8_t *lessThanTail=
            findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue);
        if(lessThanTail==NULL) {
            return NULL;
        }
        // A successful recursion has visited at least one value, so uniqueValue is set now.
        // haveUniqueValue is passed by value; record this here so that the
        // greater-or-equal half compares against uniqueValue rather than overwriting it.
        haveUniqueValue=TRUE;
        // The recursion stops behind the last comparison byte of the less-than half.
        // Matching for that byte continues with the node right there,
        // so the values reachable from it must agree as well.
        if(!findUniqueValue(lessThanTail, haveUniqueValue, uniqueValue)) {
            return NULL;
        }
        // Greater-or-equal half: starts right after the delta.
        length=length-(length>>1);
        pos=skipDelta(pos);
    }
    // Linear list: length-1 (key, value) pairs and then one more comparison byte.
    do {
        ++pos;  // ignore a comparison byte
        // handle its value
        int32_t node=*pos++;
        UBool isFinal=(UBool)(node&kValueIsFinal);
        int32_t value=readValue(pos, node>>1);
        pos=skipValue(pos, node);
        if(isFinal) {
            if(haveUniqueValue) {
                if(value!=uniqueValue) {
                    return NULL;
                }
            } else {
                uniqueValue=value;
                haveUniqueValue=TRUE;
            }
        } else {
            // A non-final value is the jump delta to the node for this key.
            if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
                return NULL;
            }
            haveUniqueValue=TRUE;
        }
    } while(--length>1);
    return pos+1;  // ignore the last comparison byte
}
// Walks the nodes starting at pos (which must be on a node lead byte) and
// determines whether every value encountered is the same.
// haveUniqueValue tells whether uniqueValue already holds a value to compare with.
// Returns TRUE once a final value has been reached without a mismatch.
UBool
ByteTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
    for(;;) {
        int32_t lead=*pos++;
        if(lead>=kMinValueLead) {
            // Value node: remember the first value, compare every later one.
            const int32_t value=readValue(pos, lead>>1);
            if(!haveUniqueValue) {
                uniqueValue=value;
                haveUniqueValue=TRUE;
            } else if(value!=uniqueValue) {
                return FALSE;
            }
            if(lead&kValueIsFinal) {
                return TRUE;
            }
            pos=skipValue(pos, lead);
            continue;
        }
        if(lead>=kMinLinearMatch) {
            // linear-match node
            pos+=lead-kMinLinearMatch+1;  // Ignore the match bytes.
            continue;
        }
        // Branch node: a lead byte of 0 means that the length is in the next byte.
        if(lead==0) {
            lead=*pos++;
        }
        pos=findUniqueValueFromBranch(pos, lead+1, haveUniqueValue, uniqueValue);
        if(pos==NULL) {
            return FALSE;
        }
        haveUniqueValue=TRUE;
    }
}
// Appends to out each byte that can continue the byte sequence from the
// current state, and returns how many such bytes there are.
int32_t
ByteTrie::getNextBytes(ByteSink &out) const {
    const uint8_t *p=pos_;
    if(p==NULL) {
        return 0;
    }
    if(remainingMatchLength_>=0) {
        // Next byte of a pending linear-match node.
        append(out, *p);
        return 1;
    }
    int32_t lead=*p++;
    if(lead>=kMinValueLead) {
        if(lead&kValueIsFinal) {
            // Nothing follows a final value.
            return 0;
        }
        // Skip the intermediate value and look at the node behind it.
        p=skipValue(p, lead);
        lead=*p++;
        U_ASSERT(lead<kMinValueLead);
    }
    if(lead>=kMinLinearMatch) {
        // First byte of the linear-match node.
        append(out, *p);
        return 1;
    }
    // Branch node: a lead byte of 0 means that the length is in the next byte.
    if(lead==0) {
        lead=*p++;
    }
    const int32_t branchLength=lead+1;
    getNextBranchBytes(p, branchLength, out);
    return branchLength;
}
void
ByteTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
while(length>kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
getNextBranchBytes(jumpByDelta(pos), length>>1, out);
length=length-(length>>1);
pos=skipDelta(pos);
}
do {
append(out, *pos++);
pos=skipValue(pos);
} while(--length>1);
append(out, *pos);
}
// Appends the single byte c to the sink.
void
ByteTrie::append(ByteSink &out, int c) {
    const char byte=(char)c;
    out.Append(&byte, 1);
}
U_NAMESPACE_END

View file

@ -0,0 +1,331 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetrie.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*/
#ifndef __BYTETRIE_H__
#define __BYTETRIE_H__
/**
* \file
* \brief C++ API: Dictionary trie for mapping arbitrary byte sequences
* to integer values.
*/
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "uassert.h"
#include "udicttrie.h"
U_NAMESPACE_BEGIN
class ByteSink;
class ByteTrieBuilder;
class ByteTrieIterator;
/**
* Light-weight, non-const reader class for a ByteTrie.
* Traverses a byte-serialized data structure with minimal state,
* for mapping byte sequences to non-negative integer values.
*/
class U_COMMON_API ByteTrie : public UMemory {
public:
/**
* Constructs a reader positioned at the root of the serialized trie.
* Only the pointer is stored; the bytes are not copied here.
* @param trieBytes Start of the serialized ByteTrie data.
*/
ByteTrie(const void *trieBytes)
: bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), remainingMatchLength_(-1) {}
/**
* Resets this trie to its initial state.
* @return *this
*/
ByteTrie &reset() {
pos_=bytes_;
remainingMatchLength_=-1;
return *this;
}
/**
* ByteTrie state object, for saving a trie's current state
* and resetting the trie back to this state later.
*/
class State : public UMemory {
public:
/**
* Constructs an empty state: bytes==NULL marks it as not holding any trie state.
* The other fields are only set by ByteTrie::saveState().
*/
State() { bytes=NULL; }
private:
friend class ByteTrie;
// Copy of the saved trie's bytes_; used to recognize the trie the state came from.
const uint8_t *bytes;
// Copy of the saved trie's pos_.
const uint8_t *pos;
// Copy of the saved trie's remainingMatchLength_.
int32_t remainingMatchLength;
};
/**
* Saves the state of this trie.
* @param state Receives a copy of this trie's current position.
* @return *this
* @see resetToState
*/
const ByteTrie &saveState(State &state) const {
state.bytes=bytes_;
state.pos=pos_;
state.remainingMatchLength=remainingMatchLength_;
return *this;
}
/**
* Resets this trie to the saved state.
* If the state object contains no state, or the state of a different trie,
* then this trie remains unchanged.
* @param state A state previously filled in by saveState().
* @return *this
* @see saveState
* @see reset
*/
ByteTrie &resetToState(const State &state) {
// Only accept a state that was saved from the same trie bytes.
if(bytes_==state.bytes && bytes_!=NULL) {
pos_=state.pos;
remainingMatchLength_=state.remainingMatchLength;
}
return *this;
}
/**
* Determines whether the byte sequence so far matches, whether it has a value,
* and whether another input byte can continue a matching byte sequence.
* @return The match/value Result.
*/
UDictTrieResult current() const;
/**
* Traverses the trie from the initial state for this input byte.
* Equivalent to reset().next(inByte).
* @param inByte Input byte value.
* @return The match/value Result.
*/
inline UDictTrieResult first(int32_t inByte) {
remainingMatchLength_=-1;
return nextImpl(bytes_, inByte);
}
/**
* Traverses the trie from the current state for this input byte.
* @param inByte Input byte value.
* @return The match/value Result.
*/
UDictTrieResult next(int32_t inByte);
/**
* Traverses the trie from the current state for this byte sequence.
* Equivalent to
* \code
* Result result=current();
* for(each c in s)
* if((result=next(c))==UDICTTRIE_NO_MATCH) return UDICTTRIE_NO_MATCH;
* return result;
* \endcode
* @param s The input bytes.
* @param length The number of bytes in s; negative if s is NUL-terminated.
* @return The match/value Result.
*/
UDictTrieResult next(const char *s, int32_t length);
/**
* Returns a matching byte sequence's value if called immediately after
* current()/first()/next() returned UDICTTRIE_HAS_VALUE or UDICTTRIE_HAS_FINAL_VALUE.
* getValue() can be called multiple times.
*
* Do not call getValue() after UDICTTRIE_NO_MATCH or UDICTTRIE_NO_VALUE!
* (pos_ is dereferenced without a NULL check.)
*/
inline int32_t getValue() const {
const uint8_t *pos=pos_;
int32_t leadByte=*pos++;
U_ASSERT(leadByte>=kMinValueLead);
return readValue(pos, leadByte>>1);
}
/**
* Determines whether all byte sequences reachable from the current state
* map to the same value.
* @param uniqueValue Receives the unique value, if this function returns TRUE.
* (output-only)
* @return TRUE if all byte sequences reachable from the current state
* map to the same value.
*/
inline UBool hasUniqueValue(int32_t &uniqueValue) const {
const uint8_t *pos=pos_;
// Skip the rest of a pending linear-match node.
return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
}
/**
* Finds each byte which continues the byte sequence from the current state.
* That is, each byte b for which it would be next(b)!=UDICTTRIE_NO_MATCH now.
* @param out Each next byte is appended to this object.
* (Only uses the out.Append(s, length) method.)
* @return the number of bytes which continue the byte sequence from here
*/
int32_t getNextBytes(ByteSink &out) const;
private:
friend class ByteTrieBuilder;
friend class ByteTrieIterator;
// Marks the trie as having no more matches: pos_==NULL.
inline void stop() {
pos_=NULL;
}
// Reads a compact 32-bit integer.
// pos is already after the leadByte, and the lead byte is already shifted right by 1.
static int32_t readValue(const uint8_t *pos, int32_t leadByte);
// Skips the bytes of a compact value.
// pos is already after the leadByte; the lead byte is NOT shifted here,
// which is why the thresholds are shifted left by 1 instead.
static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
U_ASSERT(leadByte>=kMinValueLead);
if(leadByte>=(kMinTwoByteValueLead<<1)) {
if(leadByte<(kMinThreeByteValueLead<<1)) {
++pos;
} else if(leadByte<(kFourByteValueLead<<1)) {
pos+=2;
} else {
// Four-byte or five-byte lead: 3 or 4 trailing bytes,
// selected by the lowest bit of the shifted lead byte.
pos+=3+((leadByte>>1)&1);
}
}
return pos;
}
// Skips a whole compact value, including the lead byte that pos points to.
static inline const uint8_t *skipValue(const uint8_t *pos) {
int32_t leadByte=*pos++;
return skipValue(pos, leadByte);
}
// Reads a jump delta and jumps.
static const uint8_t *jumpByDelta(const uint8_t *pos);
// Skips a compact jump delta (including its lead byte) without jumping.
static inline const uint8_t *skipDelta(const uint8_t *pos) {
int32_t delta=*pos++;
if(delta>=kMinTwoByteDeltaLead) {
if(delta<kMinThreeByteDeltaLead) {
++pos;
} else if(delta<kFourByteDeltaLead) {
pos+=2;
} else {
// kFourByteDeltaLead (0xfe) or kFiveByteDeltaLead (0xff): 3 or 4 trailing bytes.
pos+=3+(delta&1);
}
}
return pos;
}
// Maps a value node's lead byte to a result by subtracting
// the final-value bit (0 or 1) from UDICTTRIE_HAS_VALUE.
static inline UDictTrieResult valueResult(int32_t node) {
return (UDictTrieResult)(UDICTTRIE_HAS_VALUE-(node&kValueIsFinal));
}
// Handles a branch node for both next(byte) and next(string).
UDictTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
// Requires remainingMatchLength_<0.
UDictTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
// Helper functions for hasUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
UBool haveUniqueValue, int32_t &uniqueValue);
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead byte.
static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
// Helper functions for getNextBytes().
// getNextBytes() when pos is on a branch node.
static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
// Appends the single byte c to out.
static void append(ByteSink &out, int c);
// ByteTrie data structure
//
// The trie consists of a series of byte-serialized nodes for incremental
// string/byte sequence matching. The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead byte ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further bytes.
//
// Node types:
// - Value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/byte sequence so far.
// One node bit indicates whether the value is final or whether
// matching continues with the next node.
// - Linear-match node: Matches a number of bytes.
// - Branch node: Branches to other nodes according to the current input byte.
// The node byte is the length of the branch (number of bytes to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison byte.
// If one of the key bytes matches, then the value is either a final value for
// the string/byte sequence so far, or a "jump" delta to the next node.
// If the last byte matches, then matching continues with the next node.
// (Values have the same encoding as value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one byte and one "jump" delta.
// If the input byte is less than the sub-node byte, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Node lead byte values.
// 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next byte.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
static const int32_t kMaxBranchLinearSubNodeLength=5;
// 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
static const int32_t kMinLinearMatch=0x10;
static const int32_t kMaxLinearMatchLength=0x10;
// 20..ff: Variable-length value node.
// If odd, the value is final. (Otherwise, intermediate value or jump delta.)
// Then shift-right by 1 bit.
// The remaining lead byte value indicates the number of following bytes (0..4)
// and contains the value's top bits.
static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
// It is a final value if bit 0 is set.
static const int32_t kValueIsFinal=1;
// Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10
static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
static const int32_t kMaxTwoByteValue=0x1aff;
static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
static const int32_t kFourByteValueLead=0x7e;
// A little more than Unicode code points. (0x11ffff)
static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
static const int32_t kFiveByteValueLead=0x7f;
// Compact delta integers.
static const int32_t kMaxOneByteDelta=0xbf;
static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
static const int32_t kMinThreeByteDeltaLead=0xf0;
static const int32_t kFourByteDeltaLead=0xfe;
static const int32_t kFiveByteDeltaLead=0xff;
static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
// Fixed value referencing the ByteTrie bytes.
const uint8_t *bytes_;
// Iterator variables.
// Pointer to next trie byte to read. NULL if no more matches.
const uint8_t *pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
int32_t remainingMatchLength_;
};
U_NAMESPACE_END
#endif // __BYTETRIE_H__

View file

@ -400,6 +400,7 @@
<ClCompile Include="servslkf.cpp" />
<ClCompile Include="usprep.cpp" />
<ClCompile Include="bytestream.cpp" />
<ClCompile Include="bytetrie.cpp" />
<ClCompile Include="chariter.cpp" />
<ClCompile Include="charstr.cpp" />
<ClCompile Include="cstring.c" />
@ -1365,6 +1366,7 @@
</Command>
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
</CustomBuild>
<ClInclude Include="bytetrie.h" />
<CustomBuild Include="unicode\chariter.h">
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
</Command>
@ -1608,4 +1610,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -1,11 +1,12 @@
/*
**********************************************************************
* Copyright (c) 2002-2009, International Business Machines
* Copyright (c) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: October 30 2002
* Since: ICU 2.4
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
**********************************************************************
*/
#include "propname.h"
@ -16,6 +17,10 @@
#include "cstring.h"
#include "ucln_cmn.h"
#include "uarrsort.h"
#include "uinvchar.h"
#define INCLUDED_FROM_PROPNAME_CPP
#include "propname_data.h"
U_CDECL_BEGIN
@ -94,7 +99,7 @@ uprv_compareASCIIPropertyNames(const char *name1, const char *name2) {
if(((r1|r2)&0xff)==0) {
return 0;
}
/* Compare the lowercased characters */
if(r1!=r2) {
rc=(r1&0xff)-(r2&0xff);
@ -120,7 +125,7 @@ uprv_compareEBCDICPropertyNames(const char *name1, const char *name2) {
if(((r1|r2)&0xff)==0) {
return 0;
}
/* Compare the lowercased characters */
if(r1!=r2) {
rc=(r1&0xff)-(r2&0xff);
@ -138,615 +143,169 @@ U_CDECL_END
U_NAMESPACE_BEGIN
//----------------------------------------------------------------------
// PropertyAliases implementation
// Looks up a property in the ranges stored at the start of valueMaps[].
// Returns the valueMaps index of the property's two-word entry,
// or 0 if the property is not in any range.
int32_t PropNameData::findProperty(int32_t property) {
    int32_t index=1;  // valueMaps index, initially after numRanges
    int32_t rangesLeft=valueMaps[0];
    while(rangesLeft>0) {
        // Read and skip the start and limit of this range.
        const int32_t start=valueMaps[index];
        const int32_t limit=valueMaps[index+1];
        index+=2;
        if(property<start) {
            // This range and every range still to be read start above the property.
            return 0;
        }
        if(property<limit) {
            // Two words per property in this range.
            return index+(property-start)*2;
        }
        index+=(limit-start)*2;  // Skip all entries for this range.
        --rangesLeft;
    }
    return 0;
}
const char*
PropertyAliases::chooseNameInGroup(Offset offset,
UPropertyNameChoice choice) const {
int32_t c = choice;
if (!offset || c < 0) {
// Finds the name group offset for one value of a property.
// valueMapIndex is the start of the property's value map in valueMaps[]
// (0 if the property has no named values).
// Returns the name group offset, or 0 if the value is not found.
int32_t PropNameData::findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value) {
    if(valueMapIndex==0) {
        return 0;  // The property does not have named values.
    }
    int32_t index=valueMapIndex+1;  // Skip the ByteTrie offset.
    const int32_t numRanges=valueMaps[index++];
    if(numRanges>=0x10) {
        // List of values: (numRanges-0x10) values, then as many name group offsets.
        const int32_t valuesStart=index;
        const int32_t nameGroupOffsetsStart=valuesStart+numRanges-0x10;
        do {
            const int32_t listedValue=valueMaps[index];
            if(value<listedValue) {
                return 0;
            }
            if(value==listedValue) {
                return valueMaps[nameGroupOffsetsStart+index-valuesStart];
            }
            ++index;
        } while(index<nameGroupOffsetsStart);
        return 0;
    }
    // Ranges of values.
    for(int32_t rangesLeft=numRanges; rangesLeft>0; --rangesLeft) {
        // Read and skip the start and limit of this range.
        const int32_t start=valueMaps[index];
        const int32_t limit=valueMaps[index+1];
        index+=2;
        if(value<start) {
            return 0;
        }
        if(value<limit) {
            return valueMaps[index+value-start];
        }
        index+=limit-start;  // Skip all entries for this range.
    }
    return 0;
}
const char *PropNameData::getName(const char *nameGroup, int32_t nameIndex) {
int32_t numNames=*nameGroup++;
if(nameIndex<0 || numNames<=nameIndex) {
return NULL;
}
const Offset* p = (const Offset*) getPointer(offset);
while (c-- > 0) {
if (*p++ < 0) return NULL;
// Skip nameIndex names.
for(; nameIndex>0; --nameIndex) {
nameGroup=uprv_strchr(nameGroup, 0)+1;
}
Offset a = *p;
if (a < 0) a = -a;
return (const char*) getPointerNull(a);
}
const ValueMap*
PropertyAliases::getValueMap(EnumValue prop) const {
NonContiguousEnumToOffset* e2o = (NonContiguousEnumToOffset*) getPointer(enumToValue_offset);
Offset a = e2o->getOffset(prop);
return (const ValueMap*) (a ? getPointerNull(a) : NULL);
}
inline const char*
PropertyAliases::getPropertyName(EnumValue prop,
UPropertyNameChoice choice) const {
NonContiguousEnumToOffset* e2n = (NonContiguousEnumToOffset*) getPointer(enumToName_offset);
return chooseNameInGroup(e2n->getOffset(prop), choice);
}
inline EnumValue
PropertyAliases::getPropertyEnum(const char* alias) const {
NameToEnum* n2e = (NameToEnum*) getPointer(nameToEnum_offset);
return n2e->getEnum(alias, *this);
}
inline const char*
PropertyAliases::getPropertyValueName(EnumValue prop,
EnumValue value,
UPropertyNameChoice choice) const {
const ValueMap* vm = getValueMap(prop);
if (!vm) return NULL;
Offset a;
if (vm->enumToName_offset) {
a = ((EnumToOffset*) getPointer(vm->enumToName_offset))->
getOffset(value);
} else {
a = ((NonContiguousEnumToOffset*) getPointer(vm->ncEnumToName_offset))->
getOffset(value);
if(*nameGroup==0) {
return NULL; // no name (Property[Value]Aliases.txt has "n/a")
}
return chooseNameInGroup(a, choice);
return nameGroup;
}
inline EnumValue
PropertyAliases::getPropertyValueEnum(EnumValue prop,
const char* alias) const {
const ValueMap* vm = getValueMap(prop);
if (!vm) return UCHAR_INVALID_CODE;
NameToEnum* n2e = (NameToEnum*) getPointer(vm->nameToEnum_offset);
return n2e->getEnum(alias, *this);
}
U_NAMESPACE_END
U_NAMESPACE_USE
//----------------------------------------------------------------------
// UDataMemory structures
static const PropertyAliases* PNAME = NULL;
static UDataMemory* UDATA = NULL;
//----------------------------------------------------------------------
// UDataMemory loading/unloading
/**
* udata callback to verify the zone data.
*/
U_CDECL_BEGIN
static UBool U_CALLCONV
isPNameAcceptable(void* /*context*/,
const char* /*type*/, const char* /*name*/,
const UDataInfo* info) {
return
info->size >= sizeof(UDataInfo) &&
info->isBigEndian == U_IS_BIG_ENDIAN &&
info->charsetFamily == U_CHARSET_FAMILY &&
info->dataFormat[0] == PNAME_SIG_0 &&
info->dataFormat[1] == PNAME_SIG_1 &&
info->dataFormat[2] == PNAME_SIG_2 &&
info->dataFormat[3] == PNAME_SIG_3 &&
info->formatVersion[0] == PNAME_FORMAT_VERSION;
}
static UBool U_CALLCONV pname_cleanup(void) {
if (UDATA) {
udata_close(UDATA);
UDATA = NULL;
UBool PropNameData::containsName(ByteTrie &trie, const char *name) {
if(name==NULL) {
return FALSE;
}
PNAME = NULL;
return TRUE;
}
U_CDECL_END
/**
* Load the property names data. Caller should check that data is
* not loaded BEFORE calling this function. Returns TRUE if the load
* succeeds.
*/
static UBool _load() {
UErrorCode ec = U_ZERO_ERROR;
UDataMemory* data =
udata_openChoice(0, PNAME_DATA_TYPE, PNAME_DATA_NAME,
isPNameAcceptable, 0, &ec);
if (U_SUCCESS(ec)) {
umtx_lock(NULL);
if (UDATA == NULL) {
UDATA = data;
PNAME = (const PropertyAliases*) udata_getMemory(UDATA);
ucln_common_registerCleanup(UCLN_COMMON_PNAME, pname_cleanup);
data = NULL;
UDictTrieResult result=UDICTTRIE_NO_VALUE;
char c;
while((c=*name++)!=0) {
c=uprv_invCharToLowercaseAscii(c);
// Ignore delimiters '-', '_', and ASCII White_Space.
if(c==0x2d || c==0x5f || c==0x20 || (0x09<=c && c<=0x0d)) {
continue;
}
umtx_unlock(NULL);
if(!UDICTTRIE_RESULT_HAS_NEXT(result)) {
return FALSE;
}
result=trie.next((uint8_t)c);
}
if (data) {
udata_close(data);
}
return PNAME!=NULL;
return UDICTTRIE_RESULT_HAS_VALUE(result);
}
/**
* Inline function that expands to code that does a lazy load of the
* property names data. If the data is already loaded, avoids an
* unnecessary function call. If the data is not loaded, call _load()
* to load it, and return TRUE if the load succeeds.
*/
static inline UBool load() {
UBool f;
UMTX_CHECK(NULL, (PNAME!=NULL), f);
return f || _load();
// Returns the nameChoice-th name of the property, or NULL if the property is not known.
const char *PropNameData::getPropertyName(int32_t property, int32_t nameChoice) {
    const int32_t propertyIndex=findProperty(property);
    if(propertyIndex!=0) {
        // The first word of the property's entry is its name group offset.
        return getName(nameGroups+valueMaps[propertyIndex], nameChoice);
    }
    return NULL;  // Not a known property.
}
// Returns the nameChoice-th name of a property value,
// or NULL if the property is not known or no name group is found for the value.
const char *PropNameData::getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice) {
    const int32_t propertyIndex=findProperty(property);
    if(propertyIndex!=0) {
        // The second word of the property's entry is the index of its value map.
        const int32_t groupOffset=findPropertyValueNameGroup(valueMaps[propertyIndex+1], value);
        if(groupOffset!=0) {
            return getName(nameGroups+groupOffset, nameChoice);
        }
    }
    return NULL;
}
// Looks up alias in the ByteTrie that starts at byteTries+byteTrieOffset.
// Returns the trie value for the alias, or UCHAR_INVALID_CODE if it is not found.
int32_t PropNameData::getPropertyOrValueEnum(int32_t byteTrieOffset, const char *alias) {
    ByteTrie trie(byteTries+byteTrieOffset);
    if(!containsName(trie, alias)) {
        return UCHAR_INVALID_CODE;
    }
    return trie.getValue();
}
// Looks up a property alias using the trie at offset 0 of byteTries.
int32_t PropNameData::getPropertyEnum(const char *alias) {
    const int32_t propertyNamesTrieOffset=0;
    return getPropertyOrValueEnum(propertyNamesTrieOffset, alias);
}
// Looks up a value alias for the given property.
// Returns UCHAR_INVALID_CODE if the property is not known,
// has no named values, or the alias is not found.
int32_t PropNameData::getPropertyValueEnum(int32_t property, const char *alias) {
    const int32_t propertyIndex=findProperty(property);
    if(propertyIndex!=0) {
        // The second word of the property's entry is the start of its value map;
        // 0 means that the property does not have named values.
        const int32_t valueMapStart=valueMaps[propertyIndex+1];
        if(valueMapStart!=0) {
            // The first word of the value map is the ByteTrie offset.
            return getPropertyOrValueEnum(valueMaps[valueMapStart], alias);
        }
    }
    return UCHAR_INVALID_CODE;
}
//----------------------------------------------------------------------
// Public API implementation
// The C API is just a thin wrapper. Each function obtains a pointer
// to the singleton PropertyAliases, and calls the appropriate method
// on it. If it cannot obtain a pointer, because valid data is not
// available, then it returns NULL or UCHAR_INVALID_CODE.
U_CAPI const char* U_EXPORT2
u_getPropertyName(UProperty property,
UPropertyNameChoice nameChoice) {
return load() ? PNAME->getPropertyName(property, nameChoice)
: NULL;
return PropNameData::getPropertyName(property, nameChoice);
}
U_CAPI UProperty U_EXPORT2
u_getPropertyEnum(const char* alias) {
UProperty p = load() ? (UProperty) PNAME->getPropertyEnum(alias)
: UCHAR_INVALID_CODE;
return p;
return (UProperty)PropNameData::getPropertyEnum(alias);
}
U_CAPI const char* U_EXPORT2
u_getPropertyValueName(UProperty property,
int32_t value,
UPropertyNameChoice nameChoice) {
return load() ? PNAME->getPropertyValueName(property, value, nameChoice)
: NULL;
return PropNameData::getPropertyValueName(property, value, nameChoice);
}
/**
 * C API: maps a property value name/alias to its value for the given property.
 *
 * Thin wrapper that forwards to PropNameData::getPropertyValueEnum(). There is
 * no data loading step and therefore no separate "data unavailable" path.
 *
 * @param property The property whose value names are searched.
 * @param alias    The value name to look up.
 * @return The result of PropNameData::getPropertyValueEnum(), which is
 *         UCHAR_INVALID_CODE when the property is unknown or has no named values.
 */
U_CAPI int32_t U_EXPORT2
u_getPropertyValueEnum(UProperty property,
                       const char* alias) {
    return PropNameData::getPropertyValueEnum(property, alias);
}
/* data swapping ------------------------------------------------------------ */
/*
* Sub-structure-swappers use the temp array (which is as large as the
* actual data) for intermediate storage,
* as well as to indicate if a particular structure has been swapped already.
* The temp array is initially reset to all 0.
* pos is the byte offset of the sub-structure in the inBytes/outBytes/temp arrays.
*/
/**
 * Swaps one EnumToOffset structure located at byte offset pos.
 *
 * The temp array doubles as a "done" marker: once enumStart/enumLimit have been
 * stored there (in platform byte order), later calls for the same pos return
 * the size immediately without swapping again.
 *
 * @param ds         The data swapper.
 * @param inBytes    Start of the input data (after the header).
 * @param length     Number of input bytes, or negative for preflighting (size only).
 * @param outBytes   Start of the output data (after the header).
 * @param temp       Zero-initialized scratch array paralleling the data.
 * @param pos        Byte offset of the structure in inBytes/outBytes/temp.
 * @param pErrorCode Set to U_INDEX_OUTOFBOUNDS_ERROR if the structure does not
 *                   fit into length bytes.
 * @return The size of the structure in bytes, or 0 on error.
 */
int32_t
EnumToOffset::swap(const UDataSwapper *ds,
                   const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                   uint8_t *temp, int32_t pos,
                   UErrorCode *pErrorCode) {
    const EnumToOffset *inMap;
    EnumToOffset *outMap, *tempMap;
    int32_t size;

    tempMap=(EnumToOffset *)(temp+pos);
    if(tempMap->enumStart!=0 || tempMap->enumLimit!=0) {
        /* this map was swapped already */
        size=tempMap->getSize();
        return size;
    }

    inMap=(const EnumToOffset *)(inBytes+pos);
    outMap=(EnumToOffset *)(outBytes+pos);

    /* keep platform-order copies of the limits in temp; they also mark this map as done */
    tempMap->enumStart=udata_readInt32(ds, inMap->enumStart);
    tempMap->enumLimit=udata_readInt32(ds, inMap->enumLimit);
    size=tempMap->getSize();

    if(length>=0) {
        /*
         * Reject a structure that extends past the available bytes.
         * (An additional nested length<sizeof(PropertyAliases) test used to
         * guard this error; upname_swap() has already rejected that case,
         * so the nested test turned the whole bounds check into dead code.)
         */
        if(length<(pos+size)) {
            udata_printError(ds, "upname_swap(EnumToOffset): too few bytes (%d after header)\n"
                                 " for pnames.icu EnumToOffset{%d..%d} at %d\n",
                             length, tempMap->enumStart, tempMap->enumLimit, pos);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* swap enumStart and enumLimit */
        ds->swapArray32(ds, inMap, 2*sizeof(EnumValue), outMap, pErrorCode);

        /* swap _offsetArray[] */
        ds->swapArray16(ds, inMap->getOffsetArray(), (tempMap->enumLimit-tempMap->enumStart)*sizeof(Offset),
                        outMap->getOffsetArray(), pErrorCode);
    }

    return size;
}
/**
 * Swaps one NonContiguousEnumToOffset structure located at byte offset pos.
 *
 * The temp array doubles as a "done" marker: once a nonzero count has been
 * stored there (in platform byte order), later calls for the same pos return
 * the size immediately without swapping again.
 *
 * @param ds         The data swapper.
 * @param inBytes    Start of the input data (after the header).
 * @param length     Number of input bytes, or negative for preflighting (size only).
 * @param outBytes   Start of the output data (after the header).
 * @param temp       Zero-initialized scratch array paralleling the data.
 * @param pos        Byte offset of the structure in inBytes/outBytes/temp.
 * @param pErrorCode Set to U_INDEX_OUTOFBOUNDS_ERROR if the structure does not
 *                   fit into length bytes.
 * @return The size of the structure in bytes, or 0 on error.
 */
int32_t
NonContiguousEnumToOffset::swap(const UDataSwapper *ds,
                                const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                                uint8_t *temp, int32_t pos,
                                UErrorCode *pErrorCode) {
    const NonContiguousEnumToOffset *inMap;
    NonContiguousEnumToOffset *outMap, *tempMap;
    int32_t size;

    tempMap=(NonContiguousEnumToOffset *)(temp+pos);
    if(tempMap->count!=0) {
        /* this map was swapped already */
        size=tempMap->getSize();
        return size;
    }

    inMap=(const NonContiguousEnumToOffset *)(inBytes+pos);
    outMap=(NonContiguousEnumToOffset *)(outBytes+pos);

    /* keep a platform-order copy of the count in temp; it also marks this map as done */
    tempMap->count=udata_readInt32(ds, inMap->count);
    size=tempMap->getSize();

    if(length>=0) {
        /*
         * Reject a structure that extends past the available bytes.
         * (An additional nested length<sizeof(PropertyAliases) test used to
         * guard this error; upname_swap() has already rejected that case,
         * so the nested test turned the whole bounds check into dead code.)
         */
        if(length<(pos+size)) {
            udata_printError(ds, "upname_swap(NonContiguousEnumToOffset): too few bytes (%d after header)\n"
                                 " for pnames.icu NonContiguousEnumToOffset[%d] at %d\n",
                             length, tempMap->count, pos);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* swap count and _enumArray[], which are all 32-bit values */
        const int32_t enumBytes=(int32_t)((1+tempMap->count)*sizeof(EnumValue));
        ds->swapArray32(ds, inMap, enumBytes,
                        outMap, pErrorCode);

        /* swap _offsetArray[], which directly follows _enumArray[] */
        const int32_t offsetPos=pos+enumBytes;
        ds->swapArray16(ds, inBytes+offsetPos, tempMap->count*sizeof(Offset),
                        outBytes+offsetPos, pErrorCode);
    }

    return size;
}
/** One row of the temporary table that NameToEnum::swap() sorts by name. */
struct NameAndIndex {
    Offset name;   /* offset of the name string */
    Offset index;  /* position of the row before sorting */
};
U_CDECL_BEGIN

/** Signature of the charset-specific property name comparison functions. */
typedef int32_t U_CALLCONV PropNameCompareFn(const char *name1, const char *name2);

/** Context handed to upname_compareRows() through uprv_sortArray(). */
struct CompareContext {
    const char *chars;               /* base address that name offsets are added to */
    PropNameCompareFn *propCompare;  /* comparison function to apply to two names */
};

/**
 * Sort callback: compares two NameAndIndex rows by the names they refer to.
 *
 * @param context A CompareContext.
 * @param left    The first NameAndIndex row.
 * @param right   The second NameAndIndex row.
 * @return The result of the context's comparison function for the two names.
 */
static int32_t U_CALLCONV
upname_compareRows(const void *context, const void *left, const void *right) {
    const CompareContext *ctx=(const CompareContext *)context;
    const NameAndIndex *leftRow=(const NameAndIndex *)left;
    const NameAndIndex *rightRow=(const NameAndIndex *)right;
    return ctx->propCompare(ctx->chars+leftRow->name,
                            ctx->chars+rightRow->name);
}

U_CDECL_END
/**
 * Swaps one NameToEnum structure located at byte offset pos, re-sorting its
 * entries by name if the input and output charsets differ.
 *
 * The temp array doubles as a "done" marker: once a nonzero count has been
 * stored there (in platform byte order), later calls for the same pos return
 * the size immediately. When re-sorting, the temp area of this structure is
 * also used as the sort table.
 *
 * The strings in outBytes must already have been swapped before this is called,
 * because the re-sort compares the output-charset strings.
 *
 * @param ds         The data swapper.
 * @param inBytes    Start of the input data (after the header).
 * @param length     Number of input bytes, or negative for preflighting (size only).
 * @param outBytes   Start of the output data (after the header); may equal inBytes.
 * @param temp       Zero-initialized scratch array paralleling the data.
 * @param pos        Byte offset of the structure in inBytes/outBytes/temp.
 * @param pErrorCode Set to U_INDEX_OUTOFBOUNDS_ERROR if the structure does not
 *                   fit into length bytes; may also be set by uprv_sortArray().
 * @return The size of the structure in bytes, or 0 on error.
 */
int32_t
NameToEnum::swap(const UDataSwapper *ds,
                 const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
                 uint8_t *temp, int32_t pos,
                 UErrorCode *pErrorCode) {
    const NameToEnum *inMap;
    NameToEnum *outMap, *tempMap;

    const EnumValue *inEnumArray;
    EnumValue *outEnumArray;

    const Offset *inNameArray;
    Offset *outNameArray;

    NameAndIndex *sortArray;
    CompareContext cmp;

    int32_t i, size, oldIndex;

    tempMap=(NameToEnum *)(temp+pos);
    if(tempMap->count!=0) {
        /* this map was swapped already */
        size=tempMap->getSize();
        return size;
    }

    inMap=(const NameToEnum *)(inBytes+pos);
    outMap=(NameToEnum *)(outBytes+pos);

    /* keep a platform-order copy of the count in temp; it also marks this map as done */
    tempMap->count=udata_readInt32(ds, inMap->count);
    size=tempMap->getSize();

    if(length>=0) {
        /*
         * Reject a structure that extends past the available bytes.
         * (An additional nested length<sizeof(PropertyAliases) test used to
         * guard this error; upname_swap() has already rejected that case,
         * so the nested test turned the whole bounds check into dead code.)
         */
        if(length<(pos+size)) {
            udata_printError(ds, "upname_swap(NameToEnum): too few bytes (%d after header)\n"
                                 " for pnames.icu NameToEnum[%d] at %d\n",
                             length, tempMap->count, pos);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }

        /* swap count */
        ds->swapArray32(ds, inMap, 4, outMap, pErrorCode);

        inEnumArray=inMap->getEnumArray();
        outEnumArray=outMap->getEnumArray();

        /* the name array directly follows the enum array */
        inNameArray=(const Offset *)(inEnumArray+tempMap->count);
        outNameArray=(Offset *)(outEnumArray+tempMap->count);

        if(ds->inCharset==ds->outCharset) {
            /* no need to sort, just swap the enum/name arrays */
            ds->swapArray32(ds, inEnumArray, tempMap->count*4, outEnumArray, pErrorCode);
            ds->swapArray16(ds, inNameArray, tempMap->count*2, outNameArray, pErrorCode);
            return size;
        }

        /*
         * The name and enum arrays are sorted by names and must be resorted
         * if inCharset!=outCharset.
         * We use the corresponding part of the temp array to sort an array
         * of pairs of name offsets and sorting indexes.
         * Then the sorting indexes are used to permute-swap the name and enum arrays.
         *
         * The outBytes must already contain the swapped strings.
         */
        sortArray=(NameAndIndex *)tempMap->getEnumArray();
        for(i=0; i<tempMap->count; ++i) {
            sortArray[i].name=udata_readInt16(ds, inNameArray[i]);
            sortArray[i].index=(Offset)i;
        }

        /*
         * use a stable sort to avoid shuffling of equal strings,
         * which makes testing harder
         */
        cmp.chars=(const char *)outBytes;
        if (ds->outCharset==U_ASCII_FAMILY) {
            cmp.propCompare=uprv_compareASCIIPropertyNames;
        }
        else {
            cmp.propCompare=uprv_compareEBCDICPropertyNames;
        }
        uprv_sortArray(sortArray, tempMap->count, sizeof(NameAndIndex),
                       upname_compareRows, &cmp,
                       TRUE, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            udata_printError(ds, "upname_swap(NameToEnum).uprv_sortArray(%d items) failed\n",
                             tempMap->count);
            return 0;
        }

        /* copy/swap/permute _enumArray[] and _nameArray[] */
        if(inEnumArray!=outEnumArray) {
            for(i=0; i<tempMap->count; ++i) {
                oldIndex=sortArray[i].index;
                ds->swapArray32(ds, inEnumArray+oldIndex, 4, outEnumArray+i, pErrorCode);
                ds->swapArray16(ds, inNameArray+oldIndex, 2, outNameArray+i, pErrorCode);
            }
        } else {
            /*
             * in-place swapping: need to permute into a temporary array
             * and then copy back to not destroy the data
             */
            EnumValue *tempEnumArray;
            Offset *oldIndexes;

            /* write name offsets directly from sortArray */
            for(i=0; i<tempMap->count; ++i) {
                ds->writeUInt16((uint16_t *)outNameArray+i, (uint16_t)sortArray[i].name);
            }

            /*
             * compress the oldIndexes into a separate array to make space for tempEnumArray
             * the tempMap _nameArray becomes oldIndexes[], getting the index
             * values from the 2D sortArray[],
             * while sortArray=tempMap _enumArray[] becomes tempEnumArray[]
             * this saves us allocating more memory
             *
             * it works because sizeof(NameAndIndex)<=sizeof(EnumValue)
             * and because the nameArray[] can be used for oldIndexes[]
             */
            tempEnumArray=(EnumValue *)sortArray;
            oldIndexes=(Offset *)(sortArray+tempMap->count);

            /* copy sortArray[].index values into oldIndexes[] */
            for(i=0; i<tempMap->count; ++i) {
                oldIndexes[i]=sortArray[i].index;
            }

            /* permute inEnumArray[] into tempEnumArray[] */
            for(i=0; i<tempMap->count; ++i) {
                ds->swapArray32(ds, inEnumArray+oldIndexes[i], 4, tempEnumArray+i, pErrorCode);
            }

            /* copy tempEnumArray[] to outEnumArray[] */
            uprv_memcpy(outEnumArray, tempEnumArray, tempMap->count*4);
        }
    }

    return size;
}
/**
 * Swaps the whole pnames data (everything after the standard data header).
 *
 * With length<0 nothing is swapped; only the total size read from the input
 * header fields is returned. Otherwise the top-level fields, name groups and
 * strings are swapped first, then every map and value map via the
 * structure-specific swap() functions.
 *
 * Note: the error code is not inspected between the individual swap calls here.
 *
 * @param ds         The data swapper.
 * @param inBytes    Start of the input data (after the header).
 * @param length     Number of input bytes, or negative for preflighting.
 * @param outBytes   Start of the output data (after the header).
 * @param pErrorCode Set to U_INDEX_OUTOFBOUNDS_ERROR if length is smaller than
 *                   total_size, or U_MEMORY_ALLOCATION_ERROR if the temp array
 *                   cannot be allocated.
 * @return total_size as read from the input, or 0 for the two errors above.
 */
int32_t
PropertyAliases::swap(const UDataSwapper *ds,
const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
UErrorCode *pErrorCode) {
const PropertyAliases *inAliases;
PropertyAliases *outAliases;
PropertyAliases aliases;
const ValueMap *inValueMaps;
ValueMap *outValueMaps;
ValueMap valueMap;
int32_t i;
inAliases=(const PropertyAliases *)inBytes;
outAliases=(PropertyAliases *)outBytes;
/* read the input PropertyAliases - all 16-bit values */
/* (aliases becomes a platform-byte-order copy of the input fields) */
for(i=0; i<(int32_t)sizeof(PropertyAliases)/2; ++i) {
((uint16_t *)&aliases)[i]=ds->readUInt16(((const uint16_t *)inBytes)[i]);
}
if(length>=0) {
if(length<aliases.total_size) {
udata_printError(ds, "upname_swap(): too few bytes (%d after header) for all of pnames.icu\n",
length);
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
/* copy the data for inaccessible bytes */
if(inBytes!=outBytes) {
uprv_memcpy(outBytes, inBytes, aliases.total_size);
}
/* swap the PropertyAliases class fields */
ds->swapArray16(ds, inAliases, sizeof(PropertyAliases), outAliases, pErrorCode);
/* swap the name groups */
/* (everything from nameGroupPool_offset up to stringPool_offset is treated as 16-bit units) */
ds->swapArray16(ds, inBytes+aliases.nameGroupPool_offset,
aliases.stringPool_offset-aliases.nameGroupPool_offset,
outBytes+aliases.nameGroupPool_offset, pErrorCode);
/* swap the strings */
/* (must precede the NameToEnum::swap() calls below, which sort by the strings in outBytes) */
udata_swapInvStringBlock(ds, inBytes+aliases.stringPool_offset,
aliases.total_size-aliases.stringPool_offset,
outBytes+aliases.stringPool_offset, pErrorCode);
/*
* alloc uint8_t temp[total_size] and reset it
* swap each top-level struct, put at least the count fields into temp
* use subclass-specific swap() functions
* enumerate value maps, for each
* if temp does not have count!=0 yet
* read count, put it into temp
* swap the array(s)
* resort strings in name->enum maps
* swap value maps
*/
LocalMemory<uint8_t> temp;
if(temp.allocateInsteadAndReset(aliases.total_size)==NULL) {
udata_printError(ds, "upname_swap(): unable to allocate temp memory (%d bytes)\n",
aliases.total_size);
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return 0;
}
/* swap properties->name groups map */
NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
temp.getAlias(), aliases.enumToName_offset, pErrorCode);
/* swap name->properties map */
NameToEnum::swap(ds, inBytes, length, outBytes,
temp.getAlias(), aliases.nameToEnum_offset, pErrorCode);
/* swap properties->value maps map */
NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
temp.getAlias(), aliases.enumToValue_offset, pErrorCode);
/* enumerate all ValueMaps and swap them */
inValueMaps=(const ValueMap *)(inBytes+aliases.valueMap_offset);
outValueMaps=(ValueMap *)(outBytes+aliases.valueMap_offset);
for(i=0; i<aliases.valueMap_count; ++i) {
/* read the three offsets from the input; the ValueMaps array itself is swapped after the loop */
valueMap.enumToName_offset=udata_readInt16(ds, inValueMaps[i].enumToName_offset);
valueMap.ncEnumToName_offset=udata_readInt16(ds, inValueMaps[i].ncEnumToName_offset);
valueMap.nameToEnum_offset=udata_readInt16(ds, inValueMaps[i].nameToEnum_offset);
/* a value map uses either the contiguous or the non-contiguous enum->name map */
if(valueMap.enumToName_offset!=0) {
EnumToOffset::swap(ds, inBytes, length, outBytes,
temp.getAlias(), valueMap.enumToName_offset,
pErrorCode);
} else if(valueMap.ncEnumToName_offset!=0) {
NonContiguousEnumToOffset::swap(ds, inBytes, length, outBytes,
temp.getAlias(), valueMap.ncEnumToName_offset,
pErrorCode);
}
if(valueMap.nameToEnum_offset!=0) {
NameToEnum::swap(ds, inBytes, length, outBytes,
temp.getAlias(), valueMap.nameToEnum_offset,
pErrorCode);
}
}
/* swap the ValueMaps array itself */
ds->swapArray16(ds, inValueMaps, aliases.valueMap_count*sizeof(ValueMap),
outValueMaps, pErrorCode);
/* name groups and strings were swapped above */
}
/* also reached when preflighting (length<0): only the size is reported */
return aliases.total_size;
}
/**
 * Swaps pnames.icu data: the standard data header followed by the
 * PropertyAliases structures.
 *
 * @param ds         The data swapper.
 * @param inData     The input data including its header.
 * @param length     Number of input bytes, or negative for preflighting.
 * @param outData    The output buffer.
 * @param pErrorCode Set to U_UNSUPPORTED_ERROR if the data is not "pnam"
 *                   formatVersion 1, or U_INDEX_OUTOFBOUNDS_ERROR if fewer than
 *                   sizeof(PropertyAliases) bytes follow the header.
 * @return The header size plus the result of PropertyAliases::swap(), or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
upname_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
    /* udata_swapDataHeader checks the arguments */
    const int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    const UDataInfo *info=(const UDataInfo *)((const char *)inData+4);
    const UBool isPnamesVersion1=
        info->dataFormat[0]==0x70 &&   /* dataFormat="pnam" */
        info->dataFormat[1]==0x6e &&
        info->dataFormat[2]==0x61 &&
        info->dataFormat[3]==0x6d &&
        info->formatVersion[0]==1;
    if(!isPnamesVersion1) {
        udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n",
                         info->dataFormat[0], info->dataFormat[1],
                         info->dataFormat[2], info->dataFormat[3],
                         info->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    if(length>=0) {
        /* from here on, length counts only the bytes after the header */
        length-=headerSize;
        if(length<(int32_t)sizeof(PropertyAliases)) {
            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n",
                             length);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    const uint8_t *payloadIn=(const uint8_t *)inData+headerSize;
    uint8_t *payloadOut=(uint8_t *)outData+headerSize;
    return headerSize+PropertyAliases::swap(ds, payloadIn, length, payloadOut, pErrorCode);
}
//eof
U_NAMESPACE_END

View file

@ -1,11 +1,12 @@
/*
**********************************************************************
* Copyright (c) 2002-2004, International Business Machines
* Copyright (c) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Author: Alan Liu
* Created: October 30 2002
* Since: ICU 2.4
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
**********************************************************************
*/
#ifndef PROPNAME_H
@ -13,6 +14,7 @@
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "bytetrie.h"
#include "udataswp.h"
#include "uprops.h"
@ -75,441 +77,134 @@ U_CDECL_END
#define PNAME_SIG_2 ((uint8_t)0x61) /* a */
#define PNAME_SIG_3 ((uint8_t)0x6D) /* m */
#define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */
/**
* Swap pnames.icu. See udataswp.h.
* @internal
*/
U_CAPI int32_t U_EXPORT2
upname_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
#ifdef XP_CPLUSPLUS
class Builder;
U_NAMESPACE_BEGIN
/**
* An offset from the start of the pnames data to a contained entity.
* This must be a signed value, since negative offsets are used as an
* end-of-list marker. Offsets to actual objects are non-zero. A
* zero offset indicates an absent entry; this corresponds to aliases
* marked "n/a" in the original Unicode data files.
*/
typedef int16_t Offset; /* must be signed */
class PropNameData {
public:
enum {
// Byte offsets from the start of the data, after the generic header.
IX_VALUE_MAPS_OFFSET,
IX_BYTE_TRIES_OFFSET,
IX_NAME_GROUPS_OFFSET,
IX_RESERVED3_OFFSET,
IX_RESERVED4_OFFSET,
IX_TOTAL_SIZE,
#define MAX_OFFSET 0x7FFF
// Other values.
IX_MAX_NAME_LENGTH,
IX_RESERVED7,
IX_COUNT
};
/**
* A generic value for a property or property value. Typically an
* enum from uchar.h, but sometimes a non-enum value. It must be
* large enough to accommodate the largest enum value, which as of this
* writing is the largest general category mask. Need not be signed
* but may be. Typically it doesn't matter, since the caller will
* cast it to the proper type before use. Takes the special value
* UCHAR_INVALID_CODE for invalid input.
*/
typedef int32_t EnumValue;
static const char *getPropertyName(int32_t property, int32_t nameChoice);
static const char *getPropertyValueName(int32_t property, int32_t value, int32_t nameChoice);
/* ---------------------------------------------------------------------- */
/* ValueMap */
static int32_t getPropertyEnum(const char *alias);
static int32_t getPropertyValueEnum(int32_t property, const char *alias);
/**
* For any top-level property that has named values (binary and
* enumerated properties), there is a ValueMap object. This object
* maps from enum values to two other maps. One goes from value enums
* to value names. The other goes from value names to value enums.
*
* The value enum values may be contiguous or disjoint. If they are
* contiguous then the enumToName_offset is nonzero, and the
* ncEnumToName_offset is zero. Vice versa if the value enums are
* disjoint.
*
* There are n of these objects, where n is the number of binary
* properties + the number of enumerated properties.
*/
struct ValueMap {
private:
static int32_t findProperty(int32_t property);
static int32_t findPropertyValueNameGroup(int32_t valueMapIndex, int32_t value);
static const char *getName(const char *nameGroup, int32_t nameIndex);
static UBool containsName(ByteTrie &trie, const char *name);
/* -- begin pnames data -- */
/* Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */
/* Exactly one of these will be nonzero. */
Offset enumToName_offset;
Offset ncEnumToName_offset;
static int32_t getPropertyOrValueEnum(int32_t byteTrieOffset, const char *alias);
Offset nameToEnum_offset; /* Name=>enum data */
/* -- end pnames data -- */
static const int32_t indexes[];
static const int32_t valueMaps[];
static const uint8_t byteTries[];
static const char nameGroups[];
};
/* ---------------------------------------------------------------------- */
/* PropertyAliases class */
/**
* A class encapsulating access to the memory-mapped data representing
* property aliases and property value aliases (pnames). The class
* MUST have no v-table and declares certain methods inline -- small
* methods and methods that are called from only one point.
/*
* pnames.icu formatVersion 2
*
* The data members in this class correspond to the in-memory layout
* of the header of the pnames data.
* formatVersion 2 is new in ICU 4.8.
* In ICU 4.8, the pnames.icu data file is used only in ICU4J.
* ICU4C 4.8 has the same data structures hardcoded in source/common/propname_data.h.
*
* For documentation of pnames.icu formatVersion 1 see ICU4C 4.6 (2010-dec-01)
* or earlier versions of this header file (source/common/propname.h).
*
* The pnames.icu begins with the standard ICU DataHeader/UDataInfo.
* After that:
*
* int32_t indexes[8];
*
* (See the PropNameData::IX_... constants.)
*
* The first 6 indexes are byte offsets from the beginning of the data
* (beginning of indexes[]) to following structures.
* The length of each structure is the difference between its offset
* and the next one.
* All offsets are filled in: Where there is no data between two offsets,
* those two offsets are the same.
* The last offset (indexes[PropNameData::IX_TOTAL_SIZE]) indicates the
* total number of bytes in the file. (Not counting the standard headers.)
*
* The seventh index (indexes[PropNameData::IX_MAX_NAME_LENGTH]) has the
* maximum length of any Unicode property (or property value) alias.
* (Without normalization, that is, including underscores etc.)
*
* int32_t valueMaps[];
*
* The valueMaps[] begins with a map from UProperty enums to properties,
* followed by the per-property value maps from property values to names,
* for those properties that have named values.
* (Binary & enumerated, plus General_Category_Mask.)
*
* valueMaps[0] contains the number of UProperty enum ranges.
* For each range:
* int32_t start, limit -- first and last+1 UProperty enum of a dense range
* Followed by (limit-start) pairs of
* int32_t nameGroupOffset;
* Offset into nameGroups[] for the property's names/aliases.
* int32_t valueMapIndex;
* Offset of the property's value map in the valueMaps[] array.
* If the valueMapIndex is 0, then the property does not have named values.
*
* For each property's value map:
* int32_t byteTrieOffset; -- Offset into byteTries[] for name->value mapping.
* int32_t numRanges;
* If numRanges is in the range 1..15, then that many ranges of values follow.
* Per range:
* int32_t start, limit -- first and last+1 UProperty enum of a range
* Followed by (limit-start) entries of
* int32_t nameGroupOffset;
* Offset into nameGroups[] for the property value's names/aliases.
* If the nameGroupOffset is 0, then this is not a named value for this property.
* (That is, the ranges need not be dense.)
* If numRanges is >=0x10, then (numRanges-0x10) sorted values
* and then (numRanges-0x10) corresponding nameGroupOffsets follow.
* Values are sorted as signed integers.
* In this case, the set of values is dense; no nameGroupOffset will be 0.
*
* For both properties and property values, ranges are sorted by their start/limit values.
*
* uint8_t byteTries[];
*
* This is a sequence of ByteTrie structures, byte-serialized tries for
* mapping from names/aliases to values.
* The first one maps from property names/aliases to UProperty enum constants.
* The following ones are indexed by property value map byteTrieOffsets
* for mapping each property's names/aliases to their property values.
*
* char nameGroups[];
*
* This is a sequence of property name groups.
* Each group is a list of names/aliases (invariant-character strings) for
* one property or property value, in the order of UPropertyNameChoice.
* The first byte of each group is the number of names in the group.
* It is followed by that many NUL-terminated strings.
* The first string is for the short name; if there is no short name,
* then the first string is empty.
* The second string is the long name. Further strings are additional aliases.
*
* The first name group is for a property rather than a property value,
* so that a nameGroupOffset of 0 can be used to indicate "no value"
* in a property's sparse value ranges.
*/
class PropertyAliases {

    /* -- begin pnames data -- */
    /* The order and types of these fields mirror the data header layout. */

    /* Enum=>name EnumToOffset object for binary and enumerated properties */
    Offset enumToName_offset;

    /* Name=>enum data for binary & enumerated properties */
    Offset nameToEnum_offset;

    /* Enum=>offset EnumToOffset object mapping enumerated properties to ValueMap objects */
    Offset enumToValue_offset;

    /* The remaining fields are for external readers of this data; not used by this class's lookups. */
    int16_t total_size;           /* size in bytes excluding the udata header */
    Offset valueMap_offset;       /* offset to start of array */
    int16_t valueMap_count;       /* number of entries */
    Offset nameGroupPool_offset;  /* offset to start of array */
    int16_t nameGroupPool_count;  /* number of entries (not groups) */
    Offset stringPool_offset;     /* offset to start of pool */
    int16_t stringPool_count;     /* number of strings (not size in bytes) */

    /* -- end pnames data -- */

    friend class ::Builder;

    const ValueMap* getValueMap(EnumValue prop) const;

    const char* chooseNameInGroup(Offset offset,
                                  UPropertyNameChoice choice) const;

 public:

    /** Converts an offset (relative to this object) into a pointer. */
    inline const int8_t* getPointer(Offset o) const {
        const int8_t* base = (const int8_t*) this;
        return base + o;
    }

    /** Like getPointer(), but maps the offset 0 to NULL. */
    inline const int8_t* getPointerNull(Offset o) const {
        if (o) {
            return getPointer(o);
        }
        return NULL;
    }

    inline const char* getPropertyName(EnumValue prop,
                                       UPropertyNameChoice choice) const;

    inline EnumValue getPropertyEnum(const char* alias) const;

    inline const char* getPropertyValueName(EnumValue prop, EnumValue value,
                                            UPropertyNameChoice choice) const;

    inline EnumValue getPropertyValueEnum(EnumValue prop,
                                          const char* alias) const;

    /** Swaps the whole pnames data that starts with this header. */
    static int32_t
    swap(const UDataSwapper *ds,
         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
         UErrorCode *pErrorCode);
};
/* ---------------------------------------------------------------------- */
/* EnumToOffset */
/**
* A generic map from enum values to Offsets. The enum values must be
* contiguous, from enumStart to enumLimit. The Offset values may
* point to anything.
*/
class EnumToOffset {

    /* -- begin pnames data -- */
    EnumValue enumStart;
    EnumValue enumLimit;
    Offset _offsetArray; /* first element of an array of enumLimit-enumStart offsets */
    /* -- end pnames data -- */

    friend class ::Builder;

    Offset* getOffsetArray() { return &_offsetArray; }

    const Offset* getOffsetArray() const { return &_offsetArray; }

    /* Size in bytes of a map with n offsets; the class itself already holds one. */
    static int32_t getSize(int32_t n) {
        return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1);
    }

    int32_t getSize() { return getSize(enumLimit - enumStart); }

 public:

    /** Returns the offset for enumProbe, or 0 if it is outside enumStart..enumLimit-1. */
    Offset getOffset(EnumValue enumProbe) const {
        if (enumStart <= enumProbe && enumProbe < enumLimit) {
            return getOffsetArray()[enumProbe - enumStart];
        }
        return 0; /* not found */
    }

    /** Swaps one EnumToOffset structure at byte offset pos. */
    static int32_t
    swap(const UDataSwapper *ds,
         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
         uint8_t *temp, int32_t pos,
         UErrorCode *pErrorCode);
};
/* ---------------------------------------------------------------------- */
/* NonContiguousEnumToOffset */
/**
* A generic map from enum values to Offsets. The enum values may be
* disjoint. If they are contiguous, an EnumToOffset should be used
* instead. The Offset values may point to anything.
*/
class NonContiguousEnumToOffset {

    /* -- begin pnames data -- */
    int32_t count;
    EnumValue _enumArray; /* first element of an array of count enum values */
    /* an array of count Offsets follows after _enumArray[count-1] */
    /* -- end pnames data -- */

    friend class ::Builder;

    EnumValue* getEnumArray() { return &_enumArray; }

    const EnumValue* getEnumArray() const { return &_enumArray; }

    /* The offsets start right behind the last enum value. */
    Offset* getOffsetArray() {
        return (Offset*) (getEnumArray() + count);
    }

    const Offset* getOffsetArray() const {
        return (const Offset*) (getEnumArray() + count);
    }

    /* Size in bytes of a map with n enum/offset pairs. */
    static int32_t getSize(int32_t n) {
        return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n;
    }

    int32_t getSize() { return getSize(count); }

 public:

    /** Returns the offset paired with enumProbe, or 0 if enumProbe is not in the map. */
    Offset getOffset(EnumValue enumProbe) const {
        const EnumValue* enums = getEnumArray();
        const Offset* offsets = getOffsetArray();
        /* linear search; binary later if warranted */
        /* (binary is not faster for short lists) */
        /* stop at the first enum value greater than the probe */
        for (int32_t i=0; i<count && enums[i] <= enumProbe; ++i) {
            if (enums[i] == enumProbe) {
                return offsets[i];
            }
        }
        return 0; /* not found */
    }

    /** Swaps one NonContiguousEnumToOffset structure at byte offset pos. */
    static int32_t
    swap(const UDataSwapper *ds,
         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
         uint8_t *temp, int32_t pos,
         UErrorCode *pErrorCode);
};
/* ---------------------------------------------------------------------- */
/* NameToEnum */
/**
* A map from names to enum values.
*/
class NameToEnum {

    /* -- begin pnames data -- */
    int32_t count;        /* number of entries */
    EnumValue _enumArray; /* first element of an array of count EnumValues */
    /* an array of count Offsets to names follows after _enumArray[count-1] */
    /* -- end pnames data -- */

    friend class ::Builder;

    EnumValue* getEnumArray() { return &_enumArray; }

    const EnumValue* getEnumArray() const { return &_enumArray; }

    /* The name offsets start right behind the last enum value. */
    Offset* getNameArray() {
        return (Offset*) (getEnumArray() + count);
    }

    const Offset* getNameArray() const {
        return (const Offset*) (getEnumArray() + count);
    }

    /* Size in bytes of a map with n name/enum pairs. */
    static int32_t getSize(int32_t n) {
        return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n;
    }

    int32_t getSize() { return getSize(count); }

 public:

    /**
     * Returns the enum value whose name compares equal to alias under
     * uprv_comparePropertyNames(), or UCHAR_INVALID_CODE if there is none.
     * Name offsets are resolved relative to data.
     */
    EnumValue getEnum(const char* alias, const PropertyAliases& data) const {
        const Offset* names = getNameArray();
        const EnumValue* enums = getEnumArray();
        /* linear search; binary later if warranted */
        /* (binary is not faster for short lists) */
        for (int32_t i=0; i<count; ++i) {
            int32_t order = uprv_comparePropertyNames(
                alias, (const char*) data.getPointer(names[i]));
            if (order == 0) {
                return enums[i];
            }
            if (order < 0) {
                break; /* alias sorts before this name: stop searching */
            }
        }
        return UCHAR_INVALID_CODE;
    }

    /** Swaps one NameToEnum structure at byte offset pos. */
    static int32_t
    swap(const UDataSwapper *ds,
         const uint8_t *inBytes, int32_t length, uint8_t *outBytes,
         uint8_t *temp, int32_t pos,
         UErrorCode *pErrorCode);
};
/*----------------------------------------------------------------------
*
* In-memory layout. THIS IS NOT A STANDALONE DOCUMENT. It goes
* together with above C++ declarations and gives an overview.
*
* See above for definitions of Offset and EnumValue. Also, refer to
* above class declarations for the "bottom line" on data layout.
*
* Sizes:
* '*_offset' is an Offset (see above)
* 'count' members are typically int32_t (see above declarations)
* 'enumArray' is an array of EnumValue (see above)
* 'offsetArray' is an array of Offset (see above)
* 'nameArray' is an array of Offset (see above)
* 'enum*' is an EnumValue (see above)
* '*Array [x n]' means that *Array has n elements
*
* References:
* Instead of pointers, this flat data structure contains offsets.
* All offsets are relative to the start of 'header'. A notation
* is used to indicate what structure each offset points to:
* 'foo (>x)' the offset(s) in foo point to structure x
*
* Structures:
* Each structure is assigned a number, except for the header,
* which is called 'header'. The numbers are not contiguous
* for historical reasons. Some structures have sub-parts
* that are denoted with a letter, e.g., "5a".
*
* BEGIN LAYOUT
* ============
* header:
* enumToName_offset (>0)
* nameToEnum_offset (>2)
* enumToValue_offset (>3)
* (alignment padding is built into the header)
*
* The header also contains the following, used by "external readers"
* like ICU4J and icuswap.
*
* // The following are needed by external readers of this data.
* // We don't use them ourselves.
* int16_t total_size; // size in bytes excluding the udata header
* Offset valueMap_offset; // offset to start of array
* int16_t valueMap_count; // number of entries
* Offset nameGroupPool_offset; // offset to start of array
* int16_t nameGroupPool_count; // number of entries (not groups)
* Offset stringPool_offset; // offset to start of pool
* int16_t stringPool_count; // number of strings (not size in bytes)
*
* 0: # NonContiguousEnumToOffset obj for props => name groups
* count
* enumArray [x count]
* offsetArray [x count] (>98)
*
* => pad to next 4-byte boundary
*
* (1: omitted -- no longer used)
*
* 2: # NameToEnum obj for binary & enumerated props
* count
* enumArray [x count]
* nameArray [x count] (>99)
*
* => pad to next 4-byte boundary
*
* 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps
* count
* enumArray [x count]
* offsetArray [x count] (>4)
*
* => pad to next 4-byte boundary
*
* 4: # ValueMap array [x one for each enumerated prop i]
* enumToName_offset (>5a +2*i) one of these two is NULL, one is not
* ncEnumToName_offset (>5b +2*i)
* nameToEnums_offset (>6 +2*i)
*
* => pad to next 4-byte boundary
*
* for each enumerated prop (either 5a or 5b):
*
* 5a: # EnumToOffset for enumerated prop's values => name groups
* enumStart
* enumLimit
* offsetArray [x enumLimit - enumStart] (>98)
*
* => pad to next 4-byte boundary
*
* 5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups
* count
* enumArray [x count]
* offsetArray [x count] (>98)
*
* => pad to next 4-byte boundary
*
* 6: # NameToEnum for enumerated prop's values
* count
* enumArray [x count]
* nameArray [x count] (>99)
*
* => pad to next 4-byte boundary
*
* 98: # name group pool {NGP}
* [array of Offset values] (>99)
*
* 99: # string pool {SP}
* [pool of nul-terminated char* strings]
*/
U_NAMESPACE_END
#endif /* C++ */
#endif

File diff suppressed because it is too large Load diff

View file

@ -34,11 +34,6 @@ typedef struct UBiDiProps UBiDiProps;
U_CFUNC const UBiDiProps *
ubidi_getSingleton(void);
U_CAPI int32_t
ubidi_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
U_CFUNC void
ubidi_addPropertyStarts(const UBiDiProps *bdp, const USetAdder *sa, UErrorCode *pErrorCode);

View file

@ -34,11 +34,6 @@ typedef struct UCaseProps UCaseProps;
U_CAPI const UCaseProps * U_EXPORT2
ucase_getSingleton(void);
U_CAPI int32_t U_EXPORT2
ucase_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
U_CFUNC void U_EXPORT2
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);

View file

@ -44,7 +44,6 @@ typedef enum ECleanupCommonType {
UCLN_COMMON_NORMALIZER2,
UCLN_COMMON_USET,
UCLN_COMMON_UNAMES,
UCLN_COMMON_PNAME,
UCLN_COMMON_UPROPS,
UCLN_COMMON_UCNV,
UCLN_COMMON_UCNV_IO,

View file

@ -0,0 +1,83 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: udicttrie.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec17
* created by: Markus W. Scherer
*/
#ifndef __UDICTTRIE_H__
#define __UDICTTRIE_H__
/**
* \file
* \brief C API: Helper definitions for dictionary trie APIs.
*/
#include "unicode/utypes.h"
/**
 * Return values for ByteTrie::next(), UCharTrie::next() and similar methods.
 * The numeric values are significant: the UDICTTRIE_RESULT_... macros
 * test them arithmetically.
 * @see UDICTTRIE_RESULT_MATCHES
 * @see UDICTTRIE_RESULT_HAS_VALUE
 * @see UDICTTRIE_RESULT_HAS_NEXT
 */
enum UDictTrieResult {
    /**
     * The input unit(s) did not continue a matching string.
     */
    UDICTTRIE_NO_MATCH=0,
    /**
     * The input unit(s) continued a matching string
     * but there is no value for the string so far.
     * (It is a prefix of a longer string.)
     */
    UDICTTRIE_NO_VALUE=1,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * No further input byte/unit can continue a matching string.
     */
    UDICTTRIE_HAS_FINAL_VALUE=2,
    /**
     * The input unit(s) continued a matching string
     * and there is a value for the string so far.
     * This value will be returned by getValue().
     * Another input byte/unit can continue a matching string.
     */
    UDICTTRIE_HAS_VALUE=3
};
/**
 * Same as (result!=UDICTTRIE_NO_MATCH).
 * @param result A result from ByteTrie::first(), UCharTrie::next() etc.
 * @return true if the input bytes/units so far are part of a matching string/byte sequence.
 */
#define UDICTTRIE_RESULT_MATCHES(result) ((result)!=UDICTTRIE_NO_MATCH)
/**
 * Equivalent to (result==UDICTTRIE_HAS_VALUE || result==UDICTTRIE_HAS_FINAL_VALUE) but
 * this macro evaluates result exactly once.
 * This works because the two value-bearing results are the last (highest-numbered)
 * constants declared in UDictTrieResult.
 * @param result A result from ByteTrie::first(), UCharTrie::next() etc.
 * @return true if there is a value for the input bytes/units so far.
 * @see ByteTrie::getValue
 * @see UCharTrie::getValue
 */
#define UDICTTRIE_RESULT_HAS_VALUE(result) ((result)>=UDICTTRIE_HAS_FINAL_VALUE)
/**
 * Equivalent to (result==UDICTTRIE_NO_VALUE || result==UDICTTRIE_HAS_VALUE) but
 * this macro evaluates result exactly once.
 * This works because UDICTTRIE_NO_VALUE and UDICTTRIE_HAS_VALUE are the
 * second and fourth constants of UDictTrieResult, that is, the odd-numbered ones.
 * @param result A result from ByteTrie::first(), UCharTrie::next() etc.
 * @return true if another input byte/unit can continue a matching string.
 */
#define UDICTTRIE_RESULT_HAS_NEXT(result) ((result)&1)
#endif /* __UDICTTRIE_H__ */

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1997-2009, International Business Machines
* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* Date Name Description
@ -866,6 +866,11 @@ uhash_hashUCharsN(const UChar *str, int32_t length) {
STRING_HASH(UChar, str, length, *p);
}
/*
 * Computes a hash code for the length-byte string str via STRING_HASH.
 * The bytes are read as uint8_t, the same unit type that uhash_hashChars() uses,
 * so that the hash code of bytes >=0x80 does not depend on
 * whether plain char is signed on the platform.
 */
U_CAPI int32_t U_EXPORT2
uhash_hashCharsN(const char *str, int32_t length) {
    STRING_HASH(uint8_t, str, length, *p);
}
U_CAPI int32_t U_EXPORT2
uhash_hashChars(const UHashTok key) {
STRING_HASH(uint8_t, key.pointer, uprv_strlen((char*)p), *p);

View file

@ -583,6 +583,9 @@ uhash_hashChars(const UHashTok key);
U_CAPI int32_t U_EXPORT2
uhash_hashUCharsN(const UChar *key, int32_t length);
U_CAPI int32_t U_EXPORT2
uhash_hashCharsN(const char *key, int32_t length);
/**
* Generate a case-insensitive hash code for a null-terminated char*
* string. If the string is not null-terminated do not use this

View file

@ -104,6 +104,29 @@ static const uint8_t ebcdicFromAscii[256]={
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Same as asciiFromEbcdic[] except maps all letters to lowercase.
 * Indexed by the EBCDIC byte value; each row below covers 16 consecutive bytes.
 * Only letters may differ from asciiFromEbcdic[]: non-letters keep their ASCII code.
 */
static const uint8_t lowercaseAsciiFromEbcdic[256]={
    0x00, 0x01, 0x02, 0x03, 0x00, 0x09, 0x00, 0x7f, 0x00, 0x00, 0x00, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x00, 0x0a, 0x08, 0x00, 0x18, 0x19, 0x00, 0x00, 0x1c, 0x1d, 0x1e, 0x1f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x17, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x06, 0x07,
    0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x14, 0x15, 0x00, 0x1a,
    0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
    0x26, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x5e,
    0x2d, 0x2f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x60, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
    0x00, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00,
    0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5d, 0x00, 0x5d, 0x00, 0x00,
    0x7b, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x7d, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    /*
     * EBCDIC 0xe0 is the backslash, not a letter, so it keeps its ASCII code 0x5c.
     * (It used to be 0x7c, i.e. 0x5c|0x20, which is the vertical bar that 0x4f already maps to.)
     * NOTE(review): confirm that this matches asciiFromEbcdic[0xe0].
     */
    0x5c, 0x00, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
/*
* Bit sets indicating which characters of the ASCII repertoire
* (by ASCII/Unicode code) are "invariant".
@ -535,6 +558,10 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2) {
}
}
/* Maps one EBCDIC character to lowercase ASCII by looking it up in lowercaseAsciiFromEbcdic[]. */
U_CAPI char U_EXPORT2
uprv_ebcdicToLowercaseAscii(char c) {
    /* Use the unsigned byte value as the index so that bytes >=0x80 stay within the table. */
    const uint8_t ebcdicByte=(uint8_t)c;
    return (char)lowercaseAsciiFromEbcdic[ebcdicByte];
}
U_INTERNAL uint8_t* U_EXPORT2
uprv_aestrncpy(uint8_t *dst, const uint8_t *src, int32_t n)

View file

@ -83,6 +83,26 @@ uprv_compareInvEbcdicAsAscii(const char *s1, const char *s2);
# error Unknown charset family!
#endif
/**
 * Converts an EBCDIC invariant character to lowercase ASCII.
 * @param c an EBCDIC-encoded character
 * @return the ASCII counterpart of c, with letters mapped to lowercase
 * @internal
 */
U_INTERNAL char U_EXPORT2
uprv_ebcdicToLowercaseAscii(char c);
/**
* \def uprv_invCharToLowercaseAscii
* Converts an invariant character to lowercase ASCII.
* @internal
*/
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
# define uprv_invCharToLowercaseAscii uprv_asciitolower
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
# define uprv_invCharToLowercaseAscii uprv_ebcdicToLowercaseAscii
#else
# error Unknown charset family!
#endif
/**
* Copy EBCDIC to ASCII
* @internal

View file

@ -1160,7 +1160,6 @@
#define uplug_setPlugLevel U_ICU_ENTRY_POINT_RENAME(uplug_setPlugLevel)
#define uplug_setPlugName U_ICU_ENTRY_POINT_RENAME(uplug_setPlugName)
#define uplug_setPlugNoUnload U_ICU_ENTRY_POINT_RENAME(uplug_setPlugNoUnload)
#define upname_swap U_ICU_ENTRY_POINT_RENAME(upname_swap)
#define uprops_getSource U_ICU_ENTRY_POINT_RENAME(uprops_getSource)
#define upropsvec_addPropertyStarts U_ICU_ENTRY_POINT_RENAME(upropsvec_addPropertyStarts)
#define uprv_aestrncpy U_ICU_ENTRY_POINT_RENAME(uprv_aestrncpy)

View file

@ -162,15 +162,6 @@ enum {
UNORM_NX_CJK_COMPAT=2
};
/**
* Swap unorm.icu. See udataswp.h.
* @internal
*/
U_CAPI int32_t U_EXPORT2
unorm_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Description of the format of unorm.icu version 2.3.
*

View file

@ -397,15 +397,6 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
*/
/**
* Swap the ICU Unicode properties file. See uchar.c.
* @internal
*/
U_CAPI int32_t U_EXPORT2
uprops_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Swap the ICU Unicode character names file. See uchar.c.
* @internal

View file

@ -7748,7 +7748,7 @@ then
fi
# output the Makefiles
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/icu.pc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gentest/Makefile tools/gennorm2/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icuinfo/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/dicttrieperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/DateFmtPerf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
@ -8489,6 +8489,7 @@ do
"test/letest/Makefile") CONFIG_FILES="$CONFIG_FILES test/letest/Makefile" ;;
"test/perf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/Makefile" ;;
"test/perf/collationperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/collationperf/Makefile" ;;
"test/perf/dicttrieperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/dicttrieperf/Makefile" ;;
"test/perf/ubrkperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/ubrkperf/Makefile" ;;
"test/perf/charperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/charperf/Makefile" ;;
"test/perf/convperf/Makefile") CONFIG_FILES="$CONFIG_FILES test/perf/convperf/Makefile" ;;

View file

@ -1354,6 +1354,7 @@ AC_CONFIG_FILES([icudefs.mk \
test/letest/Makefile \
test/perf/Makefile \
test/perf/collationperf/Makefile \
test/perf/dicttrieperf/Makefile \
test/perf/ubrkperf/Makefile \
test/perf/charperf/Makefile \
test/perf/convperf/Makefile \

View file

@ -226,8 +226,10 @@ package390: $(OUTTMPDIR)/icudata390.lst $(PKGDATA_LIST) ./icupkg.inc packagedata
## DAT files - Misc. data files.
# 2005-may-05 Removed Unicode properties files (unorm.icu, uprops.icu, ucase.icu, ubidi.icu)
# from data build. See Jitterbug 4497. (makedata.mak revision 1.117)
# 2010-dec Removed pnames.icu.
# These are now hardcoded in ICU4C and only loaded in ICU4J.
#
DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm uts46.nrm
DAT_FILES_SHORT=unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm uts46.nrm
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
## BRK files
@ -411,7 +413,7 @@ COLL_FILES_LIST=$(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT)
BRK_FILES_LIST=$(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT)
LOCALE_FILES_LIST= $(RES_FILES_SHORT) $(LANG_FILES_SHORT) $(REGION_FILES_SHORT) $(ZONE_FILES_SHORT)
MISC_FILES_LIST=$(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(CNV_FILES_SHORT_SPECIAL) $(CURR_FILES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT)
UNI_CORE_DATA=uprops.icu ucase.icu ubidi.icu
UNI_CORE_DATA=pnames.icu uprops.icu ucase.icu ubidi.icu
UNI_CORE_TARGET_DATA=$(UNI_CORE_DATA:%=$(BUILDDIR)/%)
ifneq ($(INCLUDE_UNI_CORE_DATA),)
@ -494,7 +496,7 @@ $(BUILDDIR)/coll/%.icu: $(SRCDATADIR)/in/coll/%.icu
#################################################### SPP
# SPP FILES
$(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BUILDDIR)/unames.icu $(BUILDDIR)/pnames.icu
$(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(TOOLBINDIR)/gensprep$(TOOLEXEEXT) $(BUILDDIR)/unames.icu
$(INVOKE) $(TOOLBINDIR)/gensprep -d $(BUILDDIR) -i $(BUILDDIR) -s $(SPREPSRCDIR) -b $(@F:%.spp=%) -m $(UNICODEDATADIR) -u 3.2.0 $(<F)
#################################################### BRK
@ -753,11 +755,10 @@ clean-resindex:
$(BUILDDIR)/$(INDEX_NAME).res: $(INDEX_FILE) $(TOOLBINDIR)/genrb$(TOOLEXEEXT)
$(INVOKE) $(TOOLBINDIR)/genrb $(GENRBOPTS) -i $(BUILDDIR) -d $(BUILDDIR) $(INDEX_FILE)
# The core Unicode properties files (uprops.icu, ucase.icu, ubidi.icu)
# The core Unicode properties files (pnames.icu, uprops.icu, ucase.icu, ubidi.icu)
# are hardcoded in the common DLL and therefore not included in the data package any more.
# They are not built by default but need to be built for ICU4J data and for getting the .c source files
# when updating the Unicode data.
# Changed in Makefile.in revision 1.147. See Jitterbug 4497.
uni-core-data: build-dir $(UNI_CORE_TARGET_DATA)
@echo Unicode .icu files built to $(BUILDDIR)
@ -778,7 +779,7 @@ JAR=jar
# - package them into the .jar file
$(OUTDIR)/icu4j/icudata.jar: build-dir packagedata $(OUTTMPDIR)/$(ICUDATA_PLATFORM_NAME).dat uni-core-data
mkdir -p $(OUTDIR)/icu4j/com/ibm/icu/impl/data/$(ICUDATA_BASENAME_VERSION)b
echo ubidi.icu ucase.icu uprops.icu > $(OUTDIR)/icu4j/add.txt
echo pnames.icu ubidi.icu ucase.icu uprops.icu > $(OUTDIR)/icu4j/add.txt
$(INVOKE) $(TOOLBINDIR)/icupkg $(OUTTMPDIR)/$(ICUDATA_PLATFORM_NAME).dat $(OUTDIR)/icu4j/$(ICUDATA_BASENAME_VERSION)b.dat -a $(OUTDIR)/icu4j/add.txt -s $(BUILDDIR) -x '*' -tb -d $(OUTDIR)/icu4j/com/ibm/icu/impl/data/$(ICUDATA_BASENAME_VERSION)b
$(JAR) cf $(OUTDIR)/icu4j/icudata.jar -C $(OUTDIR)/icu4j com/ibm/icu/impl/data/$(ICUDATA_BASENAME_VERSION)b
@ -821,9 +822,9 @@ pkgdataMakefile:
###########
########### 390 (z/OS) support
UCMFILES390=ebcdic-xml-us.ucm ibm-37_P100-1995.ucm ibm-1047_P100-1995.ucm ibm-4909_P100-1999.ucm
# used to depend on uprops.icu ucase.icu ubidi.icu
# see Jitterbug 4497
ALLFILES390=pnames.icu cnvalias.icu $(UCMFILES390:.ucm=.cnv)
# used to depend on pnames.icu uprops.icu ucase.icu ubidi.icu
# These are now hardcoded in ICU4C and only loaded in ICU4J.
ALLFILES390=cnvalias.icu $(UCMFILES390:.ucm=.cnv)
$(OUTTMPDIR)/icudata390.lst: $(SRCLISTDEPS)
@echo "generating $@ (list of 390 data files)"

Binary file not shown.

View file

@ -486,9 +486,10 @@ ALL : GODATA "$(ICU_LIB_TARGET)" "$(TESTDATAOUT)\testdata.dat"
# They are not built by default but need to be built for ICU4J data and for getting the .c source files
# when updating the Unicode data.
# Changed in makedata.mak revision 1.117. See Jitterbug 4497.
# 2010-dec Removed pnames.icu.
# Command line:
# C:\svn\icuproj\icu\trunk\source\data>nmake -f makedata.mak ICUMAKE=C:\svn\icuproj\icu\trunk\source\data\ CFG=x86\Debug uni-core-data
uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
uni-core-data: GODATA "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
@echo Unicode .icu files built to "$(ICUBLD_PKG)"
# Build the ICU4J icudata.jar and testdata.jar.
@ -501,7 +502,7 @@ uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(IC
# - package them into the .jar file
"$(ICUOUT)\icu4j\icudata.jar": GODATA "$(ICUOUT)\$(ICUPKG).dat" uni-core-data
if not exist "$(ICUOUT)\icu4j\com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b" mkdir "$(ICUOUT)\icu4j\com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b"
echo ubidi.icu ucase.icu uprops.icu > "$(ICUOUT)\icu4j\add.txt"
echo pnames.icu ubidi.icu ucase.icu uprops.icu > "$(ICUOUT)\icu4j\add.txt"
"$(ICUPBIN)\icupkg" "$(ICUOUT)\$(ICUPKG).dat" "$(ICUOUT)\icu4j\$(U_ICUDATA_NAME)b.dat" -a "$(ICUOUT)\icu4j\add.txt" -s "$(ICUBLD_PKG)" -x * -tb -d "$(ICUOUT)\icu4j\com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b"
"$(JAR)" cf "$(ICUOUT)\icu4j\icudata.jar" -C "$(ICUOUT)\icu4j" com\ibm\icu\impl\data\$(U_ICUDATA_NAME)b
@ -586,11 +587,10 @@ icu4j-data-install :
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
-@erase "$(ICUTMP)\$(ICUPKG).dat"
!ELSE
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) $(CNV_FILES_SPECIAL) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\uts46.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
@echo Building icu data
cd "$(ICUBLD_PKG)"
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
pnames.icu
unames.icu
confusables.cfu
$(ICUCOL)\ucadata.icu
@ -985,9 +985,8 @@ $(UCM_SOURCE_SPECIAL): {"$(ICUTOOLS)\makeconv\$(CFG)"}makeconv.exe
# See Jitterbug 4497 for details.
$(MISC_SOURCE) $(RB_FILES) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(BRK_RES_FILES) $(TRANSLIT_RES_FILES): {"$(ICUTOOLS)\genrb\$(CFG)"}genrb.exe "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu"
# This used to depend on "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
# This data is now hard coded as a part of the library.
# See Jitterbug 4497 for details.
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\nfc.nrm"
# This used to depend on "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu"
# These are now hardcoded in ICU4C and only loaded in ICU4J.
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\nfc.nrm"
!ENDIF

View file

@ -13,6 +13,20 @@
---------------------------------------------------------------------------- ***
Unicode 6.1 update
(TODO: Copy and adjust most of the 6.0 update instructions,
but keep the following section in its new form.
So far, this only documents the new procedure for building the property names data.)
* run genpname
(builds both pnames.icu and propname_data.h)
- ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/data/in
- ~/svn.icu/tools/trunk/bld/unicode$ c/genpname/genpname -v -d ~/svn.icu/trunk/src/source/common --csource
- rebuild ICU & tools
---------------------------------------------------------------------------- ***
Unicode 6.0 update
*** related ICU Trac tickets

View file

@ -52,7 +52,6 @@
#include "ucol_swp.h"
#include "ucnv_bld.h"
#include "sprpimpl.h"
#include "propname.h"
#include "rbbidata.h"
/* swapping implementation in i18n */
@ -1310,10 +1309,16 @@ static const struct {
{"thaidict", "ctd", triedict_swap},
#endif
/* the last item should not be #if'ed so that it can reliably omit the last comma */
#if 0
/*
* Starting with ICU 4.8, the Unicode property (value) aliases data
* is hardcoded in the ICU4C common library.
* The swapper was moved to the toolutil library for swapping for ICU4J.
*/
/* Unicode properties */
{"pnames", "icu", upname_swap},
#endif
#if 0
/*
* Starting with ICU4C 3.4, the core Unicode properties files
@ -1336,6 +1341,7 @@ static const struct {
{"confusables", "cfu", uspoof_swap},
#endif
{"unames", "icu", uchar_swapNames}
/* the last item should not be #if'ed so that it can reliably omit the last comma */
};
/* Large enough for the largest swappable data item. */
@ -1673,6 +1679,7 @@ TestSwapData() {
uprv_strcat(name, swapCases[i].type);
pData=udata_open(pkg, swapCases[i].type, nm, &errorCode);
if(U_SUCCESS(errorCode)) {
TestSwapCase(pData, name, swapCases[i].swapFn, buffer, buffer+SWAP_BUFFER_SIZE);
udata_close(pData);

View file

@ -50,6 +50,7 @@ sdtfmtts.o svccoll.o tchcfmt.o selfmts.o \
tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
tsmthred.o tsnmfmt.o tsputil.o tstnrapi.o tstnorm.o tzbdtest.o \
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o strcase.o transtst.o strtest.o thcoll.o \
bytetrietest.o uchartrietest.o \
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o \
testutil.o transrt.o trnserr.o normconf.o sfwdchit.o \
jamotest.o srchtest.o reptest.o regextst.o \

View file

@ -0,0 +1,843 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetrietest.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov16
* created by: Markus W. Scherer
*/
#include <string.h>
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "bytetrie.h"
#include "bytetriebuilder.h"
#include "bytetrieiterator.h"
#include "intltest.h"
// Number of elements of a statically sized array, as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// One item of test data: a NUL-terminated byte string and the value associated with it.
struct StringAndValue {
const char *s;
int32_t value;
};
// Test suite for ByteTrie, ByteTrieBuilder and ByteTrieIterator.
class ByteTrieTest : public IntlTest {
public:
ByteTrieTest() {}
virtual ~ByteTrieTest();
// Test framework entry point; registers the Test...() methods below.
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
// Test cases.
void TestBuilder();
void TestEmpty();
void Test_a();
void Test_a_ab();
void TestShortestBranch();
void TestBranches();
void TestLongSequence();
void TestLongBranch();
void TestValuesForState();
void TestCompact();
// Not a test case: builds the month-names trie that the following tests share.
StringPiece buildMonthsTrie(ByteTrieBuilder &builder, UDictTrieBuildOption buildOption);
void TestHasUniqueValue();
void TestGetNextBytes();
void TestIteratorFromBranch();
void TestIteratorFromLinearMatch();
void TestTruncatingIteratorFromRoot();
void TestTruncatingIteratorFromLinearMatchShort();
void TestTruncatingIteratorFromLinearMatchLong();
// Helper methods shared by the test cases.
void checkData(const StringAndValue data[], int32_t dataLength);
void checkData(const StringAndValue data[], int32_t dataLength, UDictTrieBuildOption buildOption);
StringPiece buildTrie(const StringAndValue data[], int32_t dataLength,
ByteTrieBuilder &builder, UDictTrieBuildOption buildOption);
void checkFirst(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNext(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNextWithState(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNextString(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkIterator(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkIterator(ByteTrieIterator &iter, const StringAndValue data[], int32_t dataLength);
};
// Factory function: returns a newly allocated ByteTrieTest.
extern IntlTest *createByteTrieTest() {
    IntlTest *suite=new ByteTrieTest();
    return suite;
}
// Out-of-line destructor with an empty body.
ByteTrieTest::~ByteTrieTest() {
}
// Test framework dispatcher: logs the suite name when executing and
// registers every test method through the TESTCASE_AUTO macros.
// NOTE(review): the registration order presumably determines each test's index;
// confirm before reordering the list.
void ByteTrieTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
if(exec) {
logln("TestSuite ByteTrieTest: ");
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestBuilder);
TESTCASE_AUTO(TestEmpty);
TESTCASE_AUTO(Test_a);
TESTCASE_AUTO(Test_a_ab);
TESTCASE_AUTO(TestShortestBranch);
TESTCASE_AUTO(TestBranches);
TESTCASE_AUTO(TestLongSequence);
TESTCASE_AUTO(TestLongBranch);
TESTCASE_AUTO(TestValuesForState);
TESTCASE_AUTO(TestCompact);
TESTCASE_AUTO(TestHasUniqueValue);
TESTCASE_AUTO(TestGetNextBytes);
TESTCASE_AUTO(TestIteratorFromBranch);
TESTCASE_AUTO(TestIteratorFromLinearMatch);
TESTCASE_AUTO(TestTruncatingIteratorFromRoot);
TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchShort);
TESTCASE_AUTO(TestTruncatingIteratorFromLinearMatchLong);
TESTCASE_AUTO_END;
}
void ByteTrieTest::TestBuilder() {
IcuTestErrorCode errorCode(*this, "TestBuilder()");
ByteTrieBuilder builder;
builder.build(UDICTTRIE_BUILD_FAST, errorCode);
if(errorCode.reset()!=U_INDEX_OUTOFBOUNDS_ERROR) {
errln("ByteTrieBuilder().build() did not set U_INDEX_OUTOFBOUNDS_ERROR");
return;
}
builder.add("=", 0, errorCode).add("=", 1, errorCode).build(UDICTTRIE_BUILD_FAST, errorCode);
if(errorCode.reset()!=U_ILLEGAL_ARGUMENT_ERROR) {
errln("ByteTrieBuilder.build() did not detect duplicates");
return;
}
}
void ByteTrieTest::TestEmpty() {
static const StringAndValue data[]={
{ "", 0 }
};
checkData(data, LENGTHOF(data));
}
void ByteTrieTest::Test_a() {
static const StringAndValue data[]={
{ "a", 1 }
};
checkData(data, LENGTHOF(data));
}
void ByteTrieTest::Test_a_ab() {
static const StringAndValue data[]={
{ "a", 1 },
{ "ab", 100 }
};
checkData(data, LENGTHOF(data));
}
void ByteTrieTest::TestShortestBranch() {
static const StringAndValue data[]={
{ "a", 1000 },
{ "b", 2000 }
};
checkData(data, LENGTHOF(data));
}
// Runs checkData() on growing prefixes of the data list (the first 2 entries,
// the first 3, ... up to all of them).
void ByteTrieTest::TestBranches() {
    static const StringAndValue data[]={
        { "a", 0x10 },
        { "cc", 0x40 },
        { "e", 0x100 },
        { "ggg", 0x400 },
        { "i", 0x1000 },
        { "kkkk", 0x4000 },
        { "n", 0x10000 },
        { "ppppp", 0x40000 },
        { "r", 0x100000 },
        { "sss", 0x200000 },
        { "t", 0x400000 },
        { "uu", 0x800000 },
        { "vv", 0x7fffffff },
        // 0x80000000 does not fit into int32_t, so the literal is unsigned.
        // Cast explicitly: an implicit narrowing conversion in a braced initializer
        // is ill-formed in C++11 and later.
        { "zz", (int32_t)0x80000000 }
    };
    for(int32_t length=2; length<=LENGTHOF(data); ++length) {
        infoln("TestBranches length=%d", (int)length);
        checkData(data, length);
    }
}
void ByteTrieTest::TestLongSequence() {
static const StringAndValue data[]={
{ "a", -1 },
// sequence of linear-match nodes
{ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", -2 },
// more than 256 bytes
{ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ", -3 }
};
checkData(data, LENGTHOF(data));
}
// Runs checkData() on a list with many different first bytes and a wide range of values.
void ByteTrieTest::TestLongBranch() {
    // Split-branch and interesting compact-integer values.
    static const StringAndValue data[]={
        { "a", -2 },
        { "b", -1 },
        { "c", 0 },
        { "d2", 1 },
        { "f", 0x3f },
        { "g", 0x40 },
        { "h", 0x41 },
        { "j23", 0x1900 },
        { "j24", 0x19ff },
        { "j25", 0x1a00 },
        { "k2", 0x1a80 },
        { "k3", 0x1aff },
        { "l234567890", 0x1b00 },
        { "l234567890123", 0x1b01 },
        { "nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn", 0x10ffff },
        { "oooooooooooooooooooooooooooooooooooooooooooooooooooooo", 0x110000 },
        { "pppppppppppppppppppppppppppppppppppppppppppppppppppppp", 0x120000 },
        { "r", 0x333333 },
        { "s2345", 0x4444444 },
        { "t234567890", 0x77777777 },
        // 0x80000001 does not fit into int32_t, so the literal is unsigned.
        // Cast explicitly: an implicit narrowing conversion in a braced initializer
        // is ill-formed in C++11 and later.
        { "z", (int32_t)0x80000001 }
    };
    checkData(data, LENGTHOF(data));
}
void ByteTrieTest::TestValuesForState() {
// Check that saveState() and resetToState() interact properly
// with next() and current().
static const StringAndValue data[]={
{ "a", -1 },
{ "ab", -2 },
{ "abc", -3 },
{ "abcd", -4 },
{ "abcde", -5 },
{ "abcdef", -6 }
};
checkData(data, LENGTHOF(data));
}
void ByteTrieTest::TestCompact() {
// Duplicate trailing strings and values provide opportunities for compacting.
static const StringAndValue data[]={
{ "+", 0 },
{ "+august", 8 },
{ "+december", 12 },
{ "+july", 7 },
{ "+june", 6 },
{ "+november", 11 },
{ "+october", 10 },
{ "+september", 9 },
{ "-", 0 },
{ "-august", 8 },
{ "-december", 12 },
{ "-july", 7 },
{ "-june", 6 },
{ "-november", 11 },
{ "-october", 10 },
{ "-september", 9 },
// The l+n branch (with its sub-nodes) is a duplicate but will be written
// both times because each time it follows a different linear-match node.
{ "xjuly", 7 },
{ "xjune", 6 }
};
checkData(data, LENGTHOF(data));
}
// Builds the month-names trie with the given builder and build option
// and returns whatever buildTrie() returns for it.
StringPiece ByteTrieTest::buildMonthsTrie(ByteTrieBuilder &builder, UDictTrieBuildOption buildOption) {
    // All types of nodes leading to the same value,
    // for code coverage of recursive functions.
    // In particular, we need a lot of branches on some single level
    // to exercise a split-branch node.
    static const StringAndValue months[]={
        { "august", 8 },
        { "jan", 1 },
        { "jan.", 1 },
        { "jana", 1 },
        { "janbb", 1 },
        { "janc", 1 },
        { "janddd", 1 },
        { "janee", 1 },
        { "janef", 1 },
        { "janf", 1 },
        { "jangg", 1 },
        { "janh", 1 },
        { "janiiii", 1 },
        { "janj", 1 },
        { "jankk", 1 },
        { "jankl", 1 },
        { "jankmm", 1 },
        { "janl", 1 },
        { "janm", 1 },
        { "jannnnnnnnnnnnnnnnnnnnnnnnnnnnn", 1 },
        { "jano", 1 },
        { "janpp", 1 },
        { "janqqq", 1 },
        { "janr", 1 },
        { "januar", 1 },
        { "january", 1 },
        { "july", 7 },
        { "jun", 6 },
        { "jun.", 6 },
        { "june", 6 }
    };
    const int32_t numMonths=LENGTHOF(months);
    return buildTrie(months, numMonths, builder, buildOption);
}
void ByteTrieTest::TestHasUniqueValue() {
ByteTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_FAST);
if(sp.empty()) {
return; // buildTrie() reported an error
}
ByteTrie trie(sp.data());
int32_t uniqueValue;
if(trie.hasUniqueValue(uniqueValue)) {
errln("unique value at root");
}
trie.next('j');
trie.next('a');
trie.next('n');
// hasUniqueValue() directly after next()
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=1) {
errln("not unique value 1 after \"jan\"");
}
trie.first('j');
trie.next('u');
if(trie.hasUniqueValue(uniqueValue)) {
errln("unique value after \"ju\"");
}
if(trie.next('n')!=UDICTTRIE_HAS_VALUE || 6!=trie.getValue()) {
errln("not normal value 6 after \"jun\"");
}
// hasUniqueValue() after getValue()
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=6) {
errln("not unique value 6 after \"jun\"");
}
// hasUniqueValue() from within a linear-match node
trie.first('a');
trie.next('u');
if(!trie.hasUniqueValue(uniqueValue) || uniqueValue!=8) {
errln("not unique value 8 after \"au\"");
}
}
void ByteTrieTest::TestGetNextBytes() {
ByteTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_SMALL);
if(sp.empty()) {
return; // buildTrie() reported an error
}
ByteTrie trie(sp.data());
char buffer[40];
CheckedArrayByteSink sink(buffer, LENGTHOF(buffer));
int32_t count=trie.getNextBytes(sink);
if(count!=2 || sink.NumberOfBytesAppended()!=2 || buffer[0]!='a' || buffer[1]!='j') {
errln("months getNextBytes()!=[aj] at root");
}
trie.next('j');
trie.next('a');
trie.next('n');
// getNextBytes() directly after next()
count=trie.getNextBytes(sink.Reset());
buffer[count]=0;
if(count!=20 || sink.NumberOfBytesAppended()!=20 || 0!=strcmp(buffer, ".abcdefghijklmnopqru")) {
errln("months getNextBytes()!=[.abcdefghijklmnopqru] after \"jan\"");
}
// getNextBytes() after getValue()
trie.getValue(); // next() had returned UDICTTRIE_HAS_VALUE.
memset(buffer, 0, sizeof(buffer));
count=trie.getNextBytes(sink.Reset());
if(count!=20 || sink.NumberOfBytesAppended()!=20 || 0!=strcmp(buffer, ".abcdefghijklmnopqru")) {
errln("months getNextBytes()!=[.abcdefghijklmnopqru] after \"jan\"+getValue()");
}
// getNextBytes() from a linear-match node
trie.next('u');
memset(buffer, 0, sizeof(buffer));
count=trie.getNextBytes(sink.Reset());
if(count!=1 || sink.NumberOfBytesAppended()!=1 || buffer[0]!='a') {
errln("months getNextBytes()!=[a] after \"janu\"");
}
trie.next('a');
memset(buffer, 0, sizeof(buffer));
count=trie.getNextBytes(sink.Reset());
if(count!=1 || sink.NumberOfBytesAppended()!=1 || buffer[0]!='r') {
errln("months getNextBytes()!=[r] after \"janua\"");
}
trie.next('r');
trie.next('y');
// getNextBytes() after a final match
count=trie.getNextBytes(sink.Reset());
if(count!=0 || sink.NumberOfBytesAppended()!=0) {
errln("months getNextBytes()!=[] after \"january\"");
}
}
void ByteTrieTest::TestIteratorFromBranch() {
ByteTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_FAST);
if(sp.empty()) {
return; // buildTrie() reported an error
}
ByteTrie trie(sp.data());
// Go to a branch node.
trie.next('j');
trie.next('a');
trie.next('n');
IcuTestErrorCode errorCode(*this, "TestIteratorFromBranch()");
ByteTrieIterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
// following "jan".
static const StringAndValue data[]={
{ "", 1 },
{ ".", 1 },
{ "a", 1 },
{ "bb", 1 },
{ "c", 1 },
{ "ddd", 1 },
{ "ee", 1 },
{ "ef", 1 },
{ "f", 1 },
{ "gg", 1 },
{ "h", 1 },
{ "iiii", 1 },
{ "j", 1 },
{ "kk", 1 },
{ "kl", 1 },
{ "kmm", 1 },
{ "l", 1 },
{ "m", 1 },
{ "nnnnnnnnnnnnnnnnnnnnnnnnnnnn", 1 },
{ "o", 1 },
{ "pp", 1 },
{ "qqq", 1 },
{ "r", 1 },
{ "uar", 1 },
{ "uary", 1 }
};
checkIterator(iter, data, LENGTHOF(data));
// Reset, and we should get the same result.
logln("after iter.reset()");
checkIterator(iter.reset(), data, LENGTHOF(data));
}
void ByteTrieTest::TestIteratorFromLinearMatch() {
ByteTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_SMALL);
if(sp.empty()) {
return; // buildTrie() reported an error
}
ByteTrie trie(sp.data());
// Go into a linear-match node.
trie.next('j');
trie.next('a');
trie.next('n');
trie.next('u');
trie.next('a');
IcuTestErrorCode errorCode(*this, "TestIteratorFromLinearMatch()");
ByteTrieIterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
// following "janua".
static const StringAndValue data[]={
{ "r", 1 },
{ "ry", 1 }
};
checkIterator(iter, data, LENGTHOF(data));
// Reset, and we should get the same result.
logln("after iter.reset()");
checkIterator(iter.reset(), data, LENGTHOF(data));
}
void ByteTrieTest::TestTruncatingIteratorFromRoot() {
ByteTrieBuilder builder;
StringPiece sp=buildMonthsTrie(builder, UDICTTRIE_BUILD_FAST);
if(sp.empty()) {
return; // buildTrie() reported an error
}
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromRoot()");
ByteTrieIterator iter(sp.data(), 4, errorCode);
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the first 4 characters
// of each string, and no string duplicates from the truncation.
static const StringAndValue data[]={
{ "augu", -1 },
{ "jan", 1 },
{ "jan.", 1 },
{ "jana", 1 },
{ "janb", -1 },
{ "janc", 1 },
{ "jand", -1 },
{ "jane", -1 },
{ "janf", 1 },
{ "jang", -1 },
{ "janh", 1 },
{ "jani", -1 },
{ "janj", 1 },
{ "jank", -1 },
{ "janl", 1 },
{ "janm", 1 },
{ "jann", -1 },
{ "jano", 1 },
{ "janp", -1 },
{ "janq", -1 },
{ "janr", 1 },
{ "janu", -1 },
{ "july", 7 },
{ "jun", 6 },
{ "jun.", 6 },
{ "june", 6 }
};
checkIterator(iter, data, LENGTHOF(data));
// Reset, and we should get the same result.
logln("after iter.reset()");
checkIterator(iter.reset(), data, LENGTHOF(data));
}
void ByteTrieTest::TestTruncatingIteratorFromLinearMatchShort() {
static const StringAndValue data[]={
{ "abcdef", 10 },
{ "abcdepq", 200 },
{ "abcdeyz", 3000 }
};
ByteTrieBuilder builder;
StringPiece sp=buildTrie(data, LENGTHOF(data), builder, UDICTTRIE_BUILD_FAST);
if(sp.empty()) {
return; // buildTrie() reported an error
}
ByteTrie trie(sp.data());
// Go into a linear-match node.
trie.next('a');
trie.next('b');
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchShort()");
// Truncate within the linear-match node.
ByteTrieIterator iter(trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
return;
}
static const StringAndValue expected[]={
{ "cd", -1 }
};
checkIterator(iter, expected, LENGTHOF(expected));
// Reset, and we should get the same result.
logln("after iter.reset()");
checkIterator(iter.reset(), expected, LENGTHOF(expected));
}
void ByteTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
static const StringAndValue data[]={
{ "abcdef", 10 },
{ "abcdepq", 200 },
{ "abcdeyz", 3000 }
};
ByteTrieBuilder builder;
StringPiece sp=buildTrie(data, LENGTHOF(data), builder, UDICTTRIE_BUILD_FAST);
if(sp.empty()) {
return; // buildTrie() reported an error
}
ByteTrie trie(sp.data());
// Go into a linear-match node.
trie.next('a');
trie.next('b');
trie.next('c');
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchLong()");
// Truncate after the linear-match node.
ByteTrieIterator iter(trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("ByteTrieIterator(trie) constructor")) {
return;
}
static const StringAndValue expected[]={
{ "def", 10 },
{ "dep", -1 },
{ "dey", -1 }
};
checkIterator(iter, expected, LENGTHOF(expected));
// Reset, and we should get the same result.
logln("after iter.reset()");
checkIterator(iter.reset(), expected, LENGTHOF(expected));
}
void ByteTrieTest::checkData(const StringAndValue data[], int32_t dataLength) {
logln("checkData(dataLength=%d, fast)", (int)dataLength);
checkData(data, dataLength, UDICTTRIE_BUILD_FAST);
logln("checkData(dataLength=%d, small)", (int)dataLength);
checkData(data, dataLength, UDICTTRIE_BUILD_SMALL);
}
void ByteTrieTest::checkData(const StringAndValue data[], int32_t dataLength, UDictTrieBuildOption buildOption) {
ByteTrieBuilder builder;
StringPiece sp=buildTrie(data, dataLength, builder, buildOption);
if(sp.empty()) {
return; // buildTrie() reported an error
}
checkFirst(sp, data, dataLength);
checkNext(sp, data, dataLength);
checkNextWithState(sp, data, dataLength);
checkNextString(sp, data, dataLength);
checkIterator(sp, data, dataLength);
}
// Adds all data[] items to the builder in a scrambled order, builds the trie
// with buildOption, and returns the serialized bytes.
// After a successful build it also verifies that a further add() is rejected
// with U_NO_WRITE_PERMISSION.
StringPiece ByteTrieTest::buildTrie(const StringAndValue data[], int32_t dataLength,
                                    ByteTrieBuilder &builder, UDictTrieBuildOption buildOption) {
    IcuTestErrorCode errorCode(*this, "buildTrie()");
    // Add the items to the trie builder in an interesting (not trivial, not random) order.
    // Default: walk backwards from the last item.
    int32_t pos=dataLength-1;
    int32_t stride=-1;
    if((dataLength&1)!=0) {
        // Odd number of items.
        pos=dataLength/2;
        stride=2;
    } else if((dataLength%3)!=0) {
        // Not a multiple of 3.
        pos=dataLength/5;
        stride=3;
    }
    builder.clear();
    for(int32_t added=0; added<dataLength; ++added) {
        const StringAndValue &item=data[pos];
        builder.add(item.s, item.value, errorCode);
        pos=(pos+stride)%dataLength;
    }
    StringPiece trieBytes(builder.build(buildOption, errorCode));
    if(!errorCode.logIfFailureAndReset("add()/build()")) {
        builder.add("zzz", 999, errorCode);
        if(errorCode.reset()!=U_NO_WRITE_PERMISSION) {
            errln("builder.build().add(zzz) did not set U_NO_WRITE_PERMISSION");
        }
    }
    logln("serialized trie size: %ld bytes\n", (long)trieBytes.length());
    return trieBytes;
}
void ByteTrieTest::checkFirst(const StringPiece &trieBytes,
const StringAndValue data[], int32_t dataLength) {
ByteTrie trie(trieBytes.data());
for(int32_t i=0; i<dataLength; ++i) {
int c=(uint8_t)*data[i].s;
if(c==0) {
continue; // skip empty string
}
UDictTrieResult firstResult=trie.first(c);
int32_t firstValue=UDICTTRIE_RESULT_HAS_VALUE(firstResult) ? trie.getValue() : -1;
UDictTrieResult nextResult=trie.next((uint8_t)data[i].s[1]);
if(firstResult!=trie.reset().next(c) ||
firstResult!=trie.current() ||
firstValue!=(UDICTTRIE_RESULT_HAS_VALUE(firstResult) ? trie.getValue() : -1) ||
nextResult!=trie.next((uint8_t)data[i].s[1])
) {
errln("trie.first(%c)!=trie.reset().next(same) for %s",
c, data[i].s);
}
}
}
// For each data item, checks that
// - next(string) finds it with the expected value (alternating between an
//   explicit length and -1 for NUL-terminated input),
// - stepping through the string one byte at a time keeps next(), current()
//   and getValue() consistent, and
// - the final result agrees with whether the trie can actually continue
//   with some printable ASCII byte.
void ByteTrieTest::checkNext(const StringPiece &trieBytes,
                             const StringAndValue data[], int32_t dataLength) {
    ByteTrie trie(trieBytes.data());
    ByteTrie::State state;
    for(int32_t i=0; i<dataLength; ++i) {
        // Odd items: pass -1 so that next() works with the NUL terminator.
        int32_t stringLength= (i&1) ? -1 : (int32_t)strlen(data[i].s);
        UDictTrieResult result;
        if( !UDICTTRIE_RESULT_HAS_VALUE(result=trie.next(data[i].s, stringLength)) ||
            result!=trie.current()
        ) {
            errln("trie does not seem to contain %s", data[i].s);
        } else if(trie.getValue()!=data[i].value) {
            errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
                  data[i].s,
                  (long)trie.getValue(), (long)trie.getValue(),
                  (long)data[i].value, (long)data[i].value);
        } else if(result!=trie.current() || trie.getValue()!=data[i].value) {
            errln("trie value for %s changes when repeating current()/getValue()", data[i].s);
        }
        // Same string again, one byte at a time.
        trie.reset();
        stringLength=(int32_t)strlen(data[i].s);
        result=trie.current();
        for(int32_t j=0; j<stringLength; ++j) {
            if(!UDICTTRIE_RESULT_HAS_NEXT(result)) {
                errln("trie.current()!=hasNext before end of %s (at index %d)", data[i].s, j);
                break;
            }
            if(result==UDICTTRIE_HAS_VALUE) {
                // Reading an intermediate value must not change the state.
                trie.getValue();
                if(trie.current()!=UDICTTRIE_HAS_VALUE) {
                    errln("trie.getValue().current()!=UDICTTRIE_HAS_VALUE before end of %s (at index %d)", data[i].s, j);
                    break;
                }
            }
            result=trie.next(data[i].s[j]);
            if(!UDICTTRIE_RESULT_MATCHES(result)) {
                errln("trie.next()=UDICTTRIE_NO_MATCH before end of %s (at index %d)", data[i].s, j);
                break;
            }
            if(result!=trie.current()) {
                errln("trie.next()!=following current() before end of %s (at index %d)", data[i].s, j);
                break;
            }
        }
        if(!UDICTTRIE_RESULT_HAS_VALUE(result)) {
            errln("trie.next()!=hasValue at the end of %s", data[i].s);
            continue;
        }
        trie.getValue();
        if(result!=trie.current()) {
            errln("trie.current() != current()+getValue()+current() after end of %s",
                  data[i].s);
        }
        // Compare the final current() with whether next() can actually continue.
        trie.saveState(state);
        UBool nextContinues=FALSE;
        for(int32_t c=0x20; c<0x7f; ++c) {
            if(trie.resetToState(state).next(c)) {
                nextContinues=TRUE;
                break;
            }
        }
        if((result==UDICTTRIE_HAS_VALUE)!=nextContinues) {
            // This is the byte trie test: next() consumes bytes, not UChars.
            errln("(trie.current()==UDICTTRIE_HAS_VALUE) contradicts "
                  "(trie.next(some byte)!=UDICTTRIE_NO_MATCH) after end of %s", data[i].s);
        }
        trie.reset();
    }
}
// For each data item: steps through the first third of the string, saves the
// trie state, forces a mismatch with next(0), and then checks that
// resetToState() restores current()/getValue() and that the rest of the string
// can still be matched (twice) from the saved state with the expected value.
void ByteTrieTest::checkNextWithState(const StringPiece &trieBytes,
                                      const StringAndValue data[], int32_t dataLength) {
    ByteTrie trie(trieBytes.data());
    ByteTrie::State noState, state;
    for(int32_t i=0; i<dataLength; ++i) {
        if((i&1)==0) {
            // This should have no effect.
            trie.resetToState(noState);
        }
        const char *expectedString=data[i].s;
        int32_t stringLength=(int32_t)strlen(expectedString);
        int32_t partialLength=stringLength/3;
        for(int32_t j=0; j<partialLength; ++j) {
            if(!UDICTTRIE_RESULT_MATCHES(trie.next(expectedString[j]))) {
                errln("trie.next()=UDICTTRIE_NO_MATCH for a prefix of %s", data[i].s);
                return;
            }
        }
        trie.saveState(state);
        UDictTrieResult resultAtState=trie.current();
        UDictTrieResult result;
        int32_t valueAtState=-99;
        if(UDICTTRIE_RESULT_HAS_VALUE(resultAtState)) {
            valueAtState=trie.getValue();
        }
        result=trie.next(0);  // mismatch
        if(result!=UDICTTRIE_NO_MATCH || result!=trie.current()) {
            errln("trie.next(0) matched after part of %s", data[i].s);
        }
        if( resultAtState!=trie.resetToState(state).current() ||
            (UDICTTRIE_RESULT_HAS_VALUE(resultAtState) && valueAtState!=trie.getValue())
        ) {
            errln("trie.next(part of %s) changes current()/getValue() after "
                  "saveState/next(0)/resetToState",
                  data[i].s);
        } else if(!UDICTTRIE_RESULT_HAS_VALUE(
                      result=trie.next(expectedString+partialLength,
                                       stringLength-partialLength)) ||
                  result!=trie.current()) {
            // The format has two %s conversions, so the string is passed twice.
            errln("trie.next(rest of %s) does not seem to contain %s after "
                  "saveState/next(0)/resetToState",
                  data[i].s, data[i].s);
        } else if(!UDICTTRIE_RESULT_HAS_VALUE(
                      result=trie.resetToState(state).
                                  next(expectedString+partialLength,
                                       stringLength-partialLength)) ||
                  result!=trie.current()) {
            errln("trie does not seem to contain %s after saveState/next(rest)/resetToState",
                  data[i].s);
        } else if(trie.getValue()!=data[i].value) {
            errln("trie value for %s is %ld=0x%lx instead of expected %ld=0x%lx",
                  data[i].s,
                  (long)trie.getValue(), (long)trie.getValue(),
                  (long)data[i].value, (long)data[i].value);
        }
        trie.reset();
    }
}
// next(string) is also tested in other functions,
// but here we try to go partway through the string, and then beyond it.
void ByteTrieTest::checkNextString(const StringPiece &trieBytes,
const StringAndValue data[], int32_t dataLength) {
ByteTrie trie(trieBytes.data());
for(int32_t i=0; i<dataLength; ++i) {
const char *expectedString=data[i].s;
int32_t stringLength=strlen(expectedString);
if(!trie.next(expectedString, stringLength/2)) {
errln("trie.next(up to middle of string)=UDICTTRIE_NO_MATCH for %s", data[i].s);
continue;
}
// Test that we stop properly at the end of the string.
if(trie.next(expectedString+stringLength/2, stringLength+1-stringLength/2)) {
errln("trie.next(string+NUL)!=UDICTTRIE_NO_MATCH for %s", data[i].s);
}
trie.reset();
}
}
// Creates an iterator over the whole serialized trie (maximum length 0, as
// passed by the non-truncating tests) and checks it against data[].
void ByteTrieTest::checkIterator(const StringPiece &trieBytes,
                                 const StringAndValue data[], int32_t dataLength) {
    IcuTestErrorCode errorCode(*this, "checkIterator()");
    ByteTrieIterator iter(trieBytes.data(), 0, errorCode);
    if(!errorCode.logIfFailureAndReset("ByteTrieIterator(trieBytes) constructor")) {
        checkIterator(iter, data, dataLength);
    }
}
// Checks that the iterator yields exactly the dataLength items of data[], in
// order, with matching strings and values, and that it reports no further
// items afterwards.
void ByteTrieTest::checkIterator(ByteTrieIterator &iter,
                                 const StringAndValue data[], int32_t dataLength) {
    IcuTestErrorCode errorCode(*this, "checkIterator()");
    for(int32_t i=0; i<dataLength; ++i) {
        const StringAndValue &expected=data[i];
        if(!iter.hasNext()) {
            errln("trie iterator hasNext()=FALSE for item %d: %s", (int)i, expected.s);
            break;
        }
        const UBool gotItem=iter.next(errorCode);
        if(errorCode.logIfFailureAndReset("trie iterator next() for item %d: %s", (int)i, expected.s)) {
            break;
        }
        if(!gotItem) {
            errln("trie iterator next()=FALSE for item %d: %s", (int)i, expected.s);
            break;
        }
        // Mismatches are reported but do not stop the iteration.
        if(iter.getString()!=StringPiece(expected.s)) {
            errln("trie iterator next().getString()=%s but expected %s for item %d",
                  iter.getString().data(), expected.s, (int)i);
        }
        if(iter.getValue()!=expected.value) {
            errln("trie iterator next().getValue()=%ld=0x%lx but expected %ld=0x%lx for item %d: %s",
                  (long)iter.getValue(), (long)iter.getValue(),
                  (long)expected.value, (long)expected.value,
                  (int)i, expected.s);
        }
    }
    // The iterator must be exhausted now.
    if(iter.hasNext()) {
        errln("trie iterator hasNext()=TRUE after all items");
    }
    const UBool gotExtraItem=iter.next(errorCode);
    errorCode.logIfFailureAndReset("trie iterator next() after all items");
    if(gotExtraItem) {
        errln("trie iterator next()=TRUE after all items");
    }
}

View file

@ -223,6 +223,8 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="bytetrietest.cpp" />
<ClCompile Include="uchartrietest.cpp" />
<ClCompile Include="itrbbi.cpp" />
<ClCompile Include="rbbiapts.cpp" />
<ClCompile Include="rbbitst.cpp" />
@ -529,4 +531,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -29,7 +29,9 @@
#include "aliastst.h"
#include "usettest.h"
extern IntlTest *createByteTrieTest();
static IntlTest *createLocalPointerTest();
extern IntlTest *createUCharTrieTest();
#define CASE(id, test) case id: \
name = #test; \
@ -68,6 +70,22 @@ void IntlTestUtilities::runIndexedTest( int32_t index, UBool exec, const char* &
callTest(*test, par);
}
break;
case 17:
name = "ByteTrieTest";
if (exec) {
logln("TestSuite ByteTrieTest---"); logln();
LocalPointer<IntlTest> test(createByteTrieTest());
callTest(*test, par);
}
break;
case 18:
name = "UCharTrieTest";
if (exec) {
logln("TestSuite UCharTrieTest---"); logln();
LocalPointer<IntlTest> test(createUCharTrieTest());
callTest(*test, par);
}
break;
default: name = ""; break; //needed to end loop
}
}

File diff suppressed because it is too large Load diff

View file

@ -18,7 +18,7 @@ subdir = test/perf
## Files to remove for 'make clean'
CLEANFILES = *~
SUBDIRS = collationperf charperf normperf ubrkperf unisetperf usetperf ustrperf utfperf utrie2perf DateFmtPerf
SUBDIRS = collationperf charperf dicttrieperf normperf ubrkperf unisetperf usetperf ustrperf utfperf utrie2perf DateFmtPerf
# Subdirs that support 'xperf'
XSUBDIRS = DateFmtPerf

View file

@ -0,0 +1,79 @@
## Makefile.in for ICU - test/perf/dicttrieperf
## Copyright (c) 2001-2010, International Business Machines Corporation and
## others. All Rights Reserved.
## Builds the single executable $(TARGET) from $(OBJECTS).
## NOTE(review): make requires recipe lines to start with a TAB; confirm that
## the leading whitespace of the recipe lines below is intact in the real file.
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = test/perf/dicttrieperf
## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS)
## Target information
TARGET = dicttrieperf
## Include paths: common (build and source trees), tools/toolutil, tools/ctestfw
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/tools/toolutil -I$(top_srcdir)/tools/ctestfw
LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUUC) $(LIBICUTOOLUTIL) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = dicttrieperf.o
## One .d dependency file per object file
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local
## Clear suffix list
.SUFFIXES :
## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
all-local: $(TARGET)
## install-local and dist-local have no recipe here
install-local:
dist-local:
clean-local:
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
$(RMV) $(OBJECTS) $(TARGET)
distclean-local: clean-local
$(RMV) Makefile
check-local: all-local
## Regenerate this Makefile via config.status when Makefile.in or config.status changes
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
$(TARGET) : $(OBJECTS)
$(LINK.cc) -o $@ $^ $(LIBS)
$(POST_BUILD_STEP)
## Run with ICU_DATA defaulting to the build tree's data directory
invoke:
ICU_DATA=$${ICU_DATA:-$(top_builddir)/data/} TZ=PST8PDT $(INVOKE) $(INVOCATION)
## Include the dependency files unless every goal matches %clean or any goal matches %install
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
ifneq ($(patsubst %install,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif
endif

View file

@ -0,0 +1,766 @@
/*
**********************************************************************
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: dicttrieperf.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec09
* created by: Markus W. Scherer
*
* Performance test program for dictionary-type tries.
*
* Usage from within <ICU build tree>/test/perf/dicttrieperf/ :
* (Linux)
* make
* export LD_LIBRARY_PATH=../../../lib:../../../stubdata:../../../tools/ctestfw
* ./dicttrieperf --sourcedir <ICU build tree>/data/out/tmp --passes 3 --iterations 1000
* or
* ./dicttrieperf -f <ICU source tree>/source/data/brkitr/thaidict.txt --passes 3 --iterations 250
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/uperf.h"
#include "unicode/utext.h"
#include "bytetrie.h"
#include "bytetriebuilder.h"
#include "charstr.h"
#include "package.h"
#include "toolutil.h"
#include "triedict.h"
#include "ucbuf.h" // struct ULine
#include "uchartrie.h"
#include "uchartriebuilder.h"
#include "uoptions.h"
#include "uvectr32.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// Test object.
// When an input file is available, the constructor reads its lines, counts the
// lines that do not start with a character below 'A' (numTextLines), and
// strips trailing CR/LF from those lines in place.
class DictionaryTriePerfTest : public UPerfTest {
public:
    DictionaryTriePerfTest(int32_t argc, const char *argv[], UErrorCode &status)
            : UPerfTest(argc, argv, NULL, 0, "", status), numTextLines(0) {
        if(!hasFile()) {
            return;
        }
        getLines(status);
        for(int32_t i=0; i<numLines; ++i) {
            ULine &line=lines[i];
            // Skip comment lines (start with a character below 'A').
            if(line.name[0]<0x41) {
                continue;
            }
            ++numTextLines;
            // Remove trailing CR LF.
            int32_t trimmedLength=line.len;
            while(trimmedLength>0) {
                const UChar last=line.name[trimmedLength-1];
                if(last!=0xa && last!=0xd) {
                    break;
                }
                --trimmedLength;
            }
            line.len=trimmedLength;
        }
    }

    virtual UPerfFunction *runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);

    // Simple accessors for the test function objects.
    const char *getSourceDir() const { return sourceDir; }

    UBool hasFile() const { return ucharBuf!=NULL; }
    const ULine *getCachedLines() const { return lines; }
    int32_t getNumLines() const { return numLines; }

    int32_t numTextLines;  // excluding comment lines
};
// Performance test function object.
// Loads icudt46l.dat (or whatever its current versioned filename)
// from the -s or --sourcedir path.
class PackageLookup : public UPerfFunction {
protected:
PackageLookup(const DictionaryTriePerfTest &perf) {
IcuToolErrorCode errorCode("PackageLookup()");
CharString filename(perf.getSourceDir(), errorCode);
int32_t filenameLength=filename.length();
if(filenameLength>0 && filename[filenameLength-1]!=U_FILE_SEP_CHAR &&
filename[filenameLength-1]!=U_FILE_ALT_SEP_CHAR) {
filename.append(U_FILE_SEP_CHAR, errorCode);
}
filename.append(U_ICUDATA_NAME, errorCode);
filename.append(".dat", errorCode);
pkg.readPackage(filename.data());
}
public:
virtual ~PackageLookup() {}
// virtual void call(UErrorCode* pErrorCode) { ... }
virtual long getOperationsPerIteration() {
return pkg.getItemCount();
}
// virtual long getEventsPerIteration();
protected:
Package pkg;
};
// One table-of-contents entry: nameOffset is the offset of the item name
// within the names string; dataOffset is not used by the searches in this file.
struct TOCEntry {
    int32_t nameOffset, dataOffset;
};

// Similar to ICU 4.6 offsetTOCLookupFn() (in ucmndata.c).
// Binary search for s among the count names at names+toc[i].nameOffset.
// Returns the index of the matching entry, or -1 if there is none.
// The search stops when the probe index no longer moves.
static int32_t simpleBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
    int32_t low=0;
    int32_t high=count;
    int32_t previousProbe=high;
    int32_t probe=(low+high)/2;
    while(probe!=previousProbe) {  // have we moved?
        const int32_t order=strcmp(s, names+toc[probe].nameOffset);
        if(order==0) {
            return probe;  // found s
        }
        if(order<0) {
            high=probe;
        } else {
            low=probe;
        }
        previousProbe=probe;
        probe=(low+high)/2;
    }
    return -1;  // not found
}
class BinarySearchPackageLookup : public PackageLookup {
public:
BinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
: PackageLookup(perf) {
IcuToolErrorCode errorCode("BinarySearchPackageLookup()");
int32_t count=pkg.getItemCount();
toc=new TOCEntry[count];
for(int32_t i=0; i<count; ++i) {
toc[i].nameOffset=itemNames.length();
toc[i].dataOffset=i; // arbitrary value, see toc comment below
// The Package class removes the "icudt46l/" prefix.
// We restore that here for a fair performance test.
const char *name=pkg.getItem(i)->name;
itemNames.append("icudt46l/", errorCode);
itemNames.append(name, strlen(name)+1, errorCode);
}
printf("size of item names: %6ld\n", (long)itemNames.length());
printf("size of TOC: %6ld\n", (long)(count*8));
printf("total index size: %6ld\n", (long)(itemNames.length()+count*8));
}
virtual ~BinarySearchPackageLookup() {
delete[] toc;
}
virtual void call(UErrorCode * /*pErrorCode*/) {
int32_t count=pkg.getItemCount();
const char *itemNameChars=itemNames.data();
const char *name=itemNameChars;
for(int32_t i=0; i<count; ++i) {
if(simpleBinarySearch(name, itemNameChars, toc, count)<0) {
fprintf(stderr, "item not found: %s\n", name);
}
name=strchr(name, 0)+1;
}
}
protected:
CharString itemNames;
// toc imitates a .dat file's array of UDataOffsetTOCEntry
// with nameOffset and dataOffset.
// We don't need the dataOffsets, but we want to imitate the real
// memory density, to measure equivalent CPU cache usage.
TOCEntry *toc;
};
#ifndef MIN
#define MIN(a,b) (((a)<(b)) ? (a) : (b))
#endif
// Compare strings where we know the shared prefix length,
// and advance the prefix length as we find that the strings share even more characters.
// On input, prefixLength is the number of leading bytes to skip in both strings.
// On return, prefixLength is the index of the first differing byte, or of the
// shared terminating NUL. The result is the difference of the two bytes at that
// index, compared as unsigned values (0 if the strings are equal).
static int32_t strcmpAfterPrefix(const char *s1, const char *s2, int32_t &prefixLength) {
    int32_t shared=prefixLength;
    for(;; ++shared) {
        const int32_t b1=(uint8_t)s1[shared];
        const int32_t b2=(uint8_t)s2[shared];
        const int32_t diff=b1-b2;
        if(diff!=0 || b1==0) {  // different or done
            prefixLength=shared;
            return diff;
        }
    }
}
// Binary search for s among the count names at names+toc[i].nameOffset,
// skipping the prefix that s is already known to share with the names at the
// boundaries of the current search range.
// Returns the index of the matching entry, or -1 if there is none.
static int32_t prefixBinarySearch(const char *s, const char *names, const TOCEntry *toc, int32_t count) {
    if(count==0) {
        return -1;
    }
    int32_t start=0;
    int32_t limit=count;
    // Remember the shared prefix between s, start and limit,
    // and don't compare that shared prefix again.
    // The shared prefix should get longer as we narrow the [start, limit[ range.
    // Invariant: startPrefixLength belongs to the name at start-1 and
    // limitPrefixLength to the name at limit; both have been compared with s
    // already and are excluded from the range.
    int32_t startPrefixLength=0;
    int32_t limitPrefixLength=0;
    // Prime the prefix lengths so that we don't keep prefixLength at 0 until
    // both the start and limit indexes have moved.
    // At the same time, we find if s is one of the start and (limit-1) names,
    // and if not, exclude them from the actual binary search.
    if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, startPrefixLength)) {
        return 0;
    }
    ++start;
    --limit;
    if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, limitPrefixLength)) {
        return limit;
    }
    while(start<limit) {
        int32_t i=(start+limit)/2;
        int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength);
        int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, prefixLength);
        if(cmp<0) {
            limit=i;
            limitPrefixLength=prefixLength;
        } else if(cmp==0) {
            return i;
        } else {
            // Exclude i itself: with start=i the range would stop shrinking
            // once limit==start+1, and the loop would never end for a
            // string that is not in the table.
            start=i+1;
            startPrefixLength=prefixLength;
        }
    }
    return -1;
}
class PrefixBinarySearchPackageLookup : public BinarySearchPackageLookup {
public:
PrefixBinarySearchPackageLookup(const DictionaryTriePerfTest &perf)
: BinarySearchPackageLookup(perf) {}
virtual void call(UErrorCode * /*pErrorCode*/) {
int32_t count=pkg.getItemCount();
const char *itemNameChars=itemNames.data();
const char *name=itemNameChars;
for(int32_t i=0; i<count; ++i) {
if(prefixBinarySearch(name, itemNameChars, toc, count)<0) {
fprintf(stderr, "item not found: %s\n", name);
}
name=strchr(name, 0)+1;
}
}
};
static int32_t byteTrieLookup(const char *s, const char *nameTrieBytes) {
ByteTrie trie(nameTrieBytes);
if(UDICTTRIE_RESULT_HAS_VALUE(trie.next(s, -1))) {
return trie.getValue();
} else {
return -1;
}
}
// Looks up every package item name in a ByteTrie that maps the full item
// names to their item indexes.
class ByteTriePackageLookup : public PackageLookup {
public:
    ByteTriePackageLookup(const DictionaryTriePerfTest &perf)
            : PackageLookup(perf) {
        // Label the error code with this class's name (not the binary-search
        // class it was copied from) so that a reported failure names the right place.
        IcuToolErrorCode errorCode("ByteTriePackageLookup()");
        int32_t count=pkg.getItemCount();
        for(int32_t i=0; i<count; ++i) {
            // The Package class removes the "icudt46l/" prefix.
            // We restore that here for a fair performance test.
            // We store all full names so that we do not have to reconstruct them
            // in the call() function.
            const char *name=pkg.getItem(i)->name;
            int32_t offset=itemNames.length();
            itemNames.append("icudt46l/", errorCode);
            itemNames.append(name, -1, errorCode);
            // As value, set the data item index.
            // In a real implementation, we would use that to get the
            // start and limit offset of the data item.
            StringPiece fullName(itemNames.toStringPiece());
            fullName.remove_prefix(offset);
            builder.add(fullName, i, errorCode);
            // NUL-terminate the name for call() to find the next one.
            itemNames.append(0, errorCode);
        }
        int32_t length=builder.build(UDICTTRIE_BUILD_SMALL, errorCode).length();
        printf("size of ByteTrie: %6ld\n", (long)length);
        // count+1: +1 for the last-item limit offset which we should have always had
        printf("size of dataOffsets:%6ld\n", (long)((count+1)*4));
        printf("total index size: %6ld\n", (long)(length+(count+1)*4));
    }
    virtual ~ByteTriePackageLookup() {}

    // Looks up each stored full name in the trie; the names are NUL-separated.
    virtual void call(UErrorCode *pErrorCode) {
        int32_t count=pkg.getItemCount();
        const char *nameTrieBytes=builder.build(UDICTTRIE_BUILD_SMALL, *pErrorCode).data();
        const char *name=itemNames.data();
        for(int32_t i=0; i<count; ++i) {
            if(byteTrieLookup(name, nameTrieBytes)<0) {
                fprintf(stderr, "item not found: %s\n", name);
            }
            name=strchr(name, 0)+1;  // skip past the NUL to the next name
        }
    }

protected:
    ByteTrieBuilder builder;
    CharString itemNames;
};
// Performance test function object.
// Each subclass loads a dictionary text file
// from the -s or --sourcedir path plus -f or --file-name.
// For example, <ICU source dir>/source/data/brkitr/thaidict.txt.
// One iteration counts one operation per non-comment line of that file.
class DictLookup : public UPerfFunction {
public:
    DictLookup(const DictionaryTriePerfTest &perfTest)
            : perf(perfTest) {
    }

    virtual long getOperationsPerIteration() {
        const long numWords=perf.numTextLines;
        return numWords;
    }

protected:
    // The owning test object, which caches the dictionary lines.
    const DictionaryTriePerfTest &perf;
};
class CompactTrieDictLookup : public DictLookup {
public:
CompactTrieDictLookup(const DictionaryTriePerfTest &perfTest)
: DictLookup(perfTest), ctd(NULL) {
IcuToolErrorCode errorCode("UCharTrieDictLookup()");
// U+0E1C is the median code unit, from
// the UCharTrie root node (split-branch node) for thaidict.txt.
MutableTrieDictionary builder(0xe1c, errorCode);
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
builder.addWord(lines[i].name, lines[i].len, errorCode);
}
ctd=new CompactTrieDictionary(builder, errorCode);
int32_t length=(int32_t)ctd->dataSize();
printf("size of CompactTrieDict: %6ld bytes\n", (long)length);
}
virtual ~CompactTrieDictLookup() {
delete ctd;
}
virtual void call(UErrorCode *pErrorCode) {
UText text=UTEXT_INITIALIZER;
int32_t lengths[20];
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
int32_t count;
ctd->matches(&text, lines[i].len,
lengths, count, LENGTHOF(lengths));
if(count==0 || lengths[count-1]!=lines[i].len) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
}
protected:
CompactTrieDictionary *ctd;
};
// Closely imitate CompactTrieDictionary::matches().
// Note: CompactTrieDictionary::matches() is part of its trie implementation,
// and while it loops over the text, it knows the current state.
// By contrast, this implementation uses UCharTrie API functions that have to
// check the trie state each time and load/store state in the object.
// (Whether it hasNext() and whether it is in the middle of a linear-match node.)
//
// Walks the trie along the text, up to textLimit characters, and stores the
// character count of each prefix that has a value into lengths[], up to limit
// entries. count receives the number of stored entries (0 if there are none).
// Returns the number of characters read from the text.
static int32_t
ucharTrieMatches(UCharTrie &trie,
                 UText *text, int32_t textLimit,
                 int32_t *lengths, int &count, int limit ) {
    // Set the output before any return so that the caller never reads a stale count.
    count=0;
    UChar32 c=utext_next32(text);
    // Notes:
    // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
    // b) It also ignores non-BMP code points by casting to UChar!
    if(c<0) {
        return 0;
    }
    // Should be firstForCodePoint() but CompactTrieDictionary
    // handles only code units.
    UDictTrieResult result=trie.first(c);
    int32_t numChars=1;
    for(;;) {
        if(UDICTTRIE_RESULT_HAS_VALUE(result)) {
            if(count<limit) {
                // lengths[count++]=(int32_t)utext_getNativeIndex(text);
                lengths[count++]=numChars;  // CompactTrieDictionary just counts chars too.
            }
            if(result==UDICTTRIE_HAS_FINAL_VALUE) {
                break;
            }
        } else if(result==UDICTTRIE_NO_MATCH) {
            break;
        }
        if(numChars>=textLimit) {
            // Note: Why do we have both a text limit and a UText that knows its length?
            break;
        }
        // Reuse c rather than declaring a second variable that shadows it.
        c=utext_next32(text);
        // Notes:
        // a) CompactTrieDictionary::matches() does not check for U_SENTINEL.
        // b) It also ignores non-BMP code points by casting to UChar!
        if(c<0) {
            break;
        }
        ++numChars;
        // Should be nextForCodePoint() but CompactTrieDictionary
        // handles only code units.
        result=trie.next(c);
    }
#if 0
    // Note: CompactTrieDictionary::matches() comments say that it leaves the UText
    // after the longest prefix match and returns the number of characters
    // that were matched.
    if(index!=lastMatch) {
        utext_setNativeIndex(text, lastMatch);
    }
    return lastMatch-start;
    // However, it does not do either of these, so I am not trying to
    // imitate it (or its docs) 100%.
#endif
    return numChars;
}
class UCharTrieDictLookup : public DictLookup {
public:
UCharTrieDictLookup(const DictionaryTriePerfTest &perfTest)
: DictLookup(perfTest) {
IcuToolErrorCode errorCode("UCharTrieDictLookup()");
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
builder.add(UnicodeString(FALSE, lines[i].name, lines[i].len), 0, errorCode);
}
UnicodeString trieUChars;
int32_t length=builder.build(UDICTTRIE_BUILD_SMALL, trieUChars, errorCode).length();
printf("size of UCharTrie: %6ld bytes\n", (long)length*2);
}
virtual ~UCharTrieDictLookup() {}
protected:
UCharTrieBuilder builder;
};
class UCharTrieDictMatches : public UCharTrieDictLookup {
public:
UCharTrieDictMatches(const DictionaryTriePerfTest &perfTest)
: UCharTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
UnicodeString uchars;
UCharTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, uchars, *pErrorCode).getBuffer());
UText text=UTEXT_INITIALIZER;
int32_t lengths[20];
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
int32_t count=0;
ucharTrieMatches(trie, &text, lines[i].len,
lengths, count, LENGTHOF(lengths));
if(count==0 || lengths[count-1]!=lines[i].len) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
}
};
class UCharTrieDictContains : public UCharTrieDictLookup {
public:
UCharTrieDictContains(const DictionaryTriePerfTest &perfTest)
: UCharTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
UnicodeString uchars;
UCharTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, uchars, *pErrorCode).getBuffer());
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
if(!UDICTTRIE_RESULT_HAS_VALUE(trie.reset().next(lines[i].name, lines[i].len))) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
}
};
// Maps a code point to the single byte used for it in the ByteTrie dictionary:
// U+0E00..U+0EFE map to their low byte, U+002E ('.') maps to 0xff.
// Returns -1 for every other code point.
static inline int32_t thaiCharToByte(UChar32 c) {
    if(c==0x2e) {
        return 0xff;
    }
    if(c<0xe00 || 0xefe<c) {
        return -1;  // not encodable as a byte
    }
    return c&0xff;
}
// Converts the UTF-16 word s[0..length[ to bytes via thaiCharToByte()
// and appends them to str.
// Returns FALSE (after printing a message to stderr) at the first code unit
// that cannot be encoded as a byte; bytes appended before it remain in str.
static UBool thaiWordToBytes(const UChar *s, int32_t length,
                             CharString &str, UErrorCode &errorCode) {
    int32_t index=0;
    while(index<length) {
        UChar c=s[index++];
        int32_t b=thaiCharToByte(c);
        if(b<0) {
            fprintf(stderr, "thaiWordToBytes(): unable to encode U+%04X as a byte\n", c);
            return FALSE;
        }
        str.append((char)b, errorCode);
    }
    return TRUE;
}
class ByteTrieDictLookup : public DictLookup {
public:
ByteTrieDictLookup(const DictionaryTriePerfTest &perfTest)
: DictLookup(perfTest), noDict(FALSE) {
IcuToolErrorCode errorCode("ByteTrieDictLookup()");
CharString str;
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
if(!thaiWordToBytes(lines[i].name, lines[i].len, str.clear(), errorCode)) {
fprintf(stderr, "thaiWordToBytes(): failed for word %ld (0-based)\n", (long)i);
noDict=TRUE;
break;
}
builder.add(str.toStringPiece(), 0, errorCode);
}
if(!noDict) {
int32_t length=builder.build(UDICTTRIE_BUILD_SMALL, errorCode).length();
printf("size of ByteTrie: %6ld bytes\n", (long)length);
}
}
virtual ~ByteTrieDictLookup() {}
protected:
ByteTrieBuilder builder;
UBool noDict;
};
// Walks the trie along the text, starting at the current UText position, and
// records in lengths[] the number of characters consumed at each point where
// the trie has a value (at most limit entries; count is their number).
// Reads at most textLimit characters.
// Returns the number of characters read, or 0 if the text is empty
// (in which case count is not modified).
static int32_t
byteTrieMatches(ByteTrie &trie,
                UText *text, int32_t textLimit,
                int32_t *lengths, int &count, int limit ) {
    UChar32 c=utext_next32(text);
    if(c<0) {
        return 0;
    }
    int32_t charsRead=1;
    count=0;
    UDictTrieResult result=trie.first(thaiCharToByte(c));
    for(;;) {
        if(UDICTTRIE_RESULT_HAS_VALUE(result)) {
            // A word ends here: remember its length in characters,
            // not the native UText index.
            if(count<limit) {
                lengths[count]=charsRead;
                ++count;
            }
            if(result==UDICTTRIE_HAS_FINAL_VALUE) {
                break;  // no longer word continues from here
            }
        } else if(result==UDICTTRIE_NO_MATCH) {
            break;
        }
        if(charsRead>=textLimit) {
            break;
        }
        c=utext_next32(text);
        if(c<0) {
            break;  // end of text
        }
        ++charsRead;
        result=trie.next(thaiCharToByte(c));
    }
    return charsRead;
}
class ByteTrieDictMatches : public ByteTrieDictLookup {
public:
ByteTrieDictMatches(const DictionaryTriePerfTest &perfTest)
: ByteTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
if(noDict) {
return;
}
ByteTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, *pErrorCode).data());
UText text=UTEXT_INITIALIZER;
int32_t lengths[20];
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
// Skip comment lines (start with a character below 'A').
if(lines[i].name[0]<0x41) {
continue;
}
utext_openUChars(&text, lines[i].name, lines[i].len, pErrorCode);
int32_t count=0;
byteTrieMatches(trie, &text, lines[i].len,
lengths, count, LENGTHOF(lengths));
if(count==0 || lengths[count-1]!=lines[i].len) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
}
};
class ByteTrieDictContains : public ByteTrieDictLookup {
public:
ByteTrieDictContains(const DictionaryTriePerfTest &perfTest)
: ByteTrieDictLookup(perfTest) {}
virtual void call(UErrorCode *pErrorCode) {
if(noDict) {
return;
}
ByteTrie trie(builder.build(UDICTTRIE_BUILD_SMALL, *pErrorCode).data());
const ULine *lines=perf.getCachedLines();
int32_t numLines=perf.getNumLines();
for(int32_t i=0; i<numLines; ++i) {
const UChar *line=lines[i].name;
// Skip comment lines (start with a character below 'A').
if(line[0]<0x41) {
continue;
}
UDictTrieResult result=trie.first(thaiCharToByte(line[0]));
int32_t lineLength=lines[i].len;
for(int32_t j=1; j<lineLength; ++j) {
if(!UDICTTRIE_RESULT_HAS_NEXT(result)) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
break;
}
result=trie.next(thaiCharToByte(line[j]));
}
if(!UDICTTRIE_RESULT_HAS_VALUE(result)) {
fprintf(stderr, "word %ld (0-based) not found\n", (long)i);
}
}
}
};
// Test dispatcher: sets name to the name of the test with the given index
// (or to "" if there is no such test) and, if exec is TRUE, returns a new
// instance of that test. Returns NULL otherwise.
// With an input file the dictionary tests are offered;
// without one, the package lookup tests.
UPerfFunction *DictionaryTriePerfTest::runIndexedTest(int32_t index, UBool exec,
                                                      const char *&name, char * /*par*/) {
    UPerfFunction *test=NULL;
    if(hasFile()) {
        // Dictionary tests, on the words of the input file.
        switch(index) {
        case 0:
            name="compacttriematches";
            if(exec) {
                test=new CompactTrieDictLookup(*this);
            }
            break;
        case 1:
            name="uchartriematches";
            if(exec) {
                test=new UCharTrieDictMatches(*this);
            }
            break;
        case 2:
            name="uchartriecontains";
            if(exec) {
                test=new UCharTrieDictContains(*this);
            }
            break;
        case 3:
            name="bytetriematches";
            if(exec) {
                test=new ByteTrieDictMatches(*this);
            }
            break;
        case 4:
            name="bytetriecontains";
            if(exec) {
                test=new ByteTrieDictContains(*this);
            }
            break;
        default:
            name="";
            break;
        }
        return test;
    }
    // Package lookup tests. Explain the mode once, before the first test runs.
    if(exec && index==0) {
        puts("Running ByteTrie perf tests on the .dat package file from the --sourcedir.\n"
             "For UCharTrie perf tests on a dictionary text file, specify the -f or --file-name.\n");
    }
    switch(index) {
    case 0:
        name="simplebinarysearch";
        if(exec) {
            test=new BinarySearchPackageLookup(*this);
        }
        break;
    case 1:
        name="prefixbinarysearch";
        if(exec) {
            test=new PrefixBinarySearchPackageLookup(*this);
        }
        break;
    case 2:
        name="bytetrie";
        if(exec) {
            test=new ByteTriePackageLookup(*this);
        }
        break;
    default:
        name="";
        break;
    }
    return test;
}
// Entry point of the dictionary trie perf test tool.
// Returns 0 on success, the error code if the test object could not be set up,
// or -1 if the tests could not be run.
int main(int argc, const char *argv[]) {
    IcuToolErrorCode errorCode("dicttrieperf main()");
    DictionaryTriePerfTest test(argc, argv, errorCode);
    if(!errorCode.isFailure()) {
        if(test.run()) {
            return 0;
        }
        fprintf(stderr, "FAILED: Tests could not be run, please check the arguments.\n");
        return -1;
    }
    // Setup failed: report it and show the usage.
    fprintf(stderr, "DictionaryTriePerfTest() failed: %s\n", errorCode.errorName());
    test.usage();
    // reset() hands back the error code, which becomes the exit code.
    return errorCode.reset();
}

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2002-2009, International Business Machines Corporation and
* Copyright (c) 2002-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -203,6 +203,12 @@ void UPerfTest::init(UOption addOptions[], int32_t addOptionsCount,
}
ULine* UPerfTest::getLines(UErrorCode& status){
if (U_FAILURE(status)) {
return NULL;
}
if (lines != NULL) {
return lines; // don't do it again
}
lines = new ULine[MAXLINES];
int maxLines = MAXLINES;
numLines=0;

View file

@ -52,6 +52,9 @@ LDFLAGS += $(LDFLAGSICUTOOLUTIL)
LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)
OBJECTS = filestrm.o package.o pkgitems.o swapimpl.o toolutil.o unewdata.o \
dicttriebuilder.o bytetriebuilder.o bytetrieiterator.o \
uchartrie.o uchartriebuilder.o uchartrieiterator.o \
denseranges.o \
ucm.o ucmstate.o uoptions.o uparse.o \
ucbuf.o xmlparser.o writesrc.o \
pkg_icu.o pkg_genc.o pkg_gencmn.o flagparser.o filetools.o \

View file

@ -0,0 +1,755 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetriebuilder.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*
* Builder class for ByteTrie dictionary trie.
*/
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "bytetrie.h"
#include "bytetriebuilder.h"
#include "charstr.h"
#include "cmemory.h"
#include "uarrsort.h"
U_NAMESPACE_BEGIN
/*
* Note: This builder implementation stores (bytes, value) pairs with full copies
* of the byte sequences, until the ByteTrie is built.
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
*/
class ByteTrieElement : public UMemory {
public:
// Use compiler's default constructor, initializes nothing.
void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
StringPiece getString(const CharString &strings) const {
int32_t offset=stringOffset;
int32_t length;
if(offset>=0) {
length=(uint8_t)strings[offset++];
} else {
offset=~offset;
length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
offset+=2;
}
return StringPiece(strings.data()+offset, length);
}
int32_t getStringLength(const CharString &strings) const {
int32_t offset=stringOffset;
if(offset>=0) {
return (uint8_t)strings[offset];
} else {
offset=~offset;
return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
}
}
char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
int32_t getValue() const { return value; }
int32_t compareStringTo(const ByteTrieElement &o, const CharString &strings) const;
private:
const char *data(const CharString &strings) const {
int32_t offset=stringOffset;
if(offset>=0) {
++offset;
} else {
offset=~offset+2;
}
return strings.data()+offset;
}
// If the stringOffset is non-negative, then the first strings byte contains
// the string length.
// If the stringOffset is negative, then the first two strings bytes contain
// the string length (big-endian), and the offset needs to be bit-inverted.
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
int32_t stringOffset;
int32_t value;
};
void
ByteTrieElement::setTo(const StringPiece &s, int32_t val,
CharString &strings, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
int32_t length=s.length();
if(length>0xffff) {
// Too long: We store the length in 1 or 2 bytes.
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
int32_t offset=strings.length();
if(length>0xff) {
offset=~offset;
strings.append((char)(length>>8), errorCode);
}
strings.append((char)length, errorCode);
stringOffset=offset;
value=val;
strings.append(s, errorCode);
}
int32_t
ByteTrieElement::compareStringTo(const ByteTrieElement &other, const CharString &strings) const {
// TODO: add StringPiece::compare(), see ticket #8187
StringPiece thisString=getString(strings);
StringPiece otherString=other.getString(strings);
int32_t lengthDiff=thisString.length()-otherString.length();
int32_t commonLength;
if(lengthDiff<=0) {
commonLength=thisString.length();
} else {
commonLength=otherString.length();
}
int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
return diff!=0 ? diff : lengthDiff;
}
// Releases the serialized trie bytes and the array of added elements.
ByteTrieBuilder::~ByteTrieBuilder() {
    uprv_free(bytes);
    delete[] elements;
}
// Adds a (byte string, value) pair to be included in the trie.
// s is copied into the builder's internal storage.
// Returns *this so that calls can be chained.
// Error codes set:
// - U_NO_WRITE_PERMISSION if the trie has already been built
// - U_MEMORY_ALLOCATION_ERROR if the elements array cannot be grown
// - whatever ByteTrieElement::setTo() reports for s
// Does nothing if errorCode is already a failure.
ByteTrieBuilder &
ByteTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(bytesLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    bytesCapacity+=s.length()+1;  // Crude bytes preallocation estimate.
    if(elementsLength==elementsCapacity) {
        // Grow the elements array: start with 1024 elements, then quadruple.
        int32_t newCapacity;
        if(elementsCapacity==0) {
            newCapacity=1024;
        } else {
            newCapacity=4*elementsCapacity;
        }
        ByteTrieElement *newElements=new ByteTrieElement[newCapacity];
        if(newElements==NULL) {
            // Keep the old array intact and stop here,
            // rather than copying into and dereferencing NULL below.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        if(elementsLength>0) {
            uprv_memcpy(newElements, elements, elementsLength*sizeof(ByteTrieElement));
        }
        delete[] elements;
        elements=newElements;
        elementsCapacity=newCapacity;
    }
    elements[elementsLength++].setTo(s, value, strings, errorCode);
    return *this;
}
U_CDECL_BEGIN

// Comparison callback for uprv_sortArray():
// context is the CharString that holds the bytes of all elements,
// left and right point to the ByteTrieElements to compare.
static int32_t U_CALLCONV
compareElementStrings(const void *context, const void *left, const void *right) {
    const CharString &strings=*static_cast<const CharString *>(context);
    const ByteTrieElement &leftElement=*static_cast<const ByteTrieElement *>(left);
    const ByteTrieElement &rightElement=*static_cast<const ByteTrieElement *>(right);
    return leftElement.compareStringTo(rightElement, strings);
}

U_CDECL_END
// Builds the byte-serialized trie for the elements added so far and
// returns it as a view into the builder's own buffer.
// buildOption selects the direct writer (UDICTTRIE_BUILD_FAST) or the
// compacting builder (any other value, i.e. UDICTTRIE_BUILD_SMALL).
// If the trie has already been built, the existing bytes are returned.
// Error codes set:
// - U_INDEX_OUTOFBOUNDS_ERROR if no elements have been added
// - U_ILLEGAL_ARGUMENT_ERROR if the same string was added more than once
// - U_MEMORY_ALLOCATION_ERROR if the byte buffer cannot be allocated or grown
// On failure an empty StringPiece is returned.
StringPiece
ByteTrieBuilder::build(UDictTrieBuildOption buildOption, UErrorCode &errorCode) {
    StringPiece result;
    if(U_FAILURE(errorCode)) {
        return result;
    }
    if(bytesLength>0) {
        // Already built.
        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
        return result;
    }
    if(elementsLength==0) {
        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return result;
    }
    uprv_sortArray(elements, elementsLength, (int32_t)sizeof(ByteTrieElement),
                   compareElementStrings, &strings,
                   FALSE,  // need not be a stable sort
                   &errorCode);
    if(U_FAILURE(errorCode)) {
        return result;
    }
    // Duplicate strings are not allowed.
    // After sorting, duplicates are adjacent.
    StringPiece prev=elements[0].getString(strings);
    for(int32_t i=1; i<elementsLength; ++i) {
        StringPiece current=elements[i].getString(strings);
        if(prev==current) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return result;
        }
        prev=current;
    }
    // Create and byte-serialize the trie for the elements.
    if(bytesCapacity<1024) {
        bytesCapacity=1024;
    }
    // A buffer can be left over from an earlier build (after clear(), or after
    // a build that failed before writing anything): release it rather than
    // leaking it when the pointer is overwritten.
    uprv_free(bytes);
    bytes=reinterpret_cast<char *>(uprv_malloc(bytesCapacity));
    if(bytes==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return result;
    }
    if(buildOption==UDICTTRIE_BUILD_FAST) {
        writeNode(0, elementsLength, 0);
    } else /* UDICTTRIE_BUILD_SMALL */ {
        createCompactBuilder(2*elementsLength, errorCode);
        Node *root=makeNode(0, elementsLength, 0, errorCode);
        if(U_SUCCESS(errorCode)) {
            root->markRightEdgesFirst(-1);
            root->write(*this);
        }
        deleteCompactBuilder();
    }
    // ensureCapacity() sets bytes to NULL when growing the buffer fails.
    if(bytes==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    } else {
        // The bytes were written back-to-front: the trie occupies the end of the buffer.
        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
    }
    return result;
}
// Writes the trie node, and recursively everything below it, for the
// [start..limit[ elements at string position byteIndex.
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length byteIndex.
// The serialization grows from the back of the buffer: whatever is to follow
// a node in the finished byte sequence is written before the node itself.
void
ByteTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t byteIndex) {
UBool hasValue=FALSE;
int32_t value=0;
// Since the elements are sorted, only the first one can end exactly at byteIndex.
if(byteIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
writeValueAndFinal(value, TRUE); // final-value node
return;
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than byteIndex.
const ByteTrieElement &minElement=elements[start];
const ByteTrieElement &maxElement=elements[limit-1];
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
if(minByte==maxByte) {
// Linear-match node: All strings have the same character at byteIndex.
// Extend the match while the first and last strings agree;
// in sorted order, every string in between then agrees as well.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastByteIndex=byteIndex;
while(++lastByteIndex<minStringLength &&
minElement.charAt(lastByteIndex, strings)==
maxElement.charAt(lastByteIndex, strings)) {}
// Write what follows the matched bytes first.
writeNode(start, limit, lastByteIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
// The chunks are written from the end of the sequence towards its start.
const char *s=minElement.getString(strings).data();
int32_t length=lastByteIndex-byteIndex;
while(length>ByteTrie::kMaxLinearMatchLength) {
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
length-=ByteTrie::kMaxLinearMatchLength;
write(s+lastByteIndex, ByteTrie::kMaxLinearMatchLength);
write(ByteTrie::kMinLinearMatch+ByteTrie::kMaxLinearMatchLength-1);
}
write(s+byteIndex, length);
write(ByteTrie::kMinLinearMatch+length-1);
} else {
// Branch node.
int32_t length=0; // Number of different bytes at byteIndex.
int32_t i=start;
do {
char byte=elements[i++].charAt(byteIndex, strings);
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minByte!=maxByte.
writeBranchSubNode(start, limit, byteIndex, length);
// Branch head: length-1, preceded in the byte sequence by a 0 byte
// if length-1 would collide with the linear-match lead byte range.
write(--length);
if(length>=ByteTrie::kMinLinearMatch) {
write(0);
}
}
// An intermediate value precedes the node it belongs to.
if(hasValue) {
writeValueAndFinal(value, FALSE);
}
}
// Writes the branch sub-node that distinguishes the length different bytes
// which the [start..limit[ elements have at byteIndex, including the
// sub-trees reached via each of those bytes.
// While there are more than kMaxBranchLinearSubNodeLength different bytes,
// the range is split in half on a middle byte (split-branch nodes);
// the remaining bytes are written as a list of byte-value pairs.
// start<limit && all strings longer than byteIndex &&
// length different bytes at byteIndex
void
ByteTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length) {
// Split points collected below; their nodes are written at the very end.
char middleBytes[16];
int32_t lessThan[16];
int32_t ltLength=0;
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle byte.
// First, find the middle byte.
int32_t count=length/2;
int32_t i=start;
char byte;
do {
byte=elements[i++].charAt(byteIndex, strings);
// No i<limit check: only length/2 of the length different bytes are
// skipped, so an element with another byte must follow.
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
} while(--count>0);
// Encode the less-than branch first.
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
writeBranchSubNode(start, i, byteIndex, length/2);
// Remember where the less-than branch was written, for the delta below.
lessThan[ltLength]=bytesLength;
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each byte, find its elements array start and whether it has a final value.
int32_t starts[ByteTrie::kMaxBranchLinearSubNodeLength];
UBool final[ByteTrie::kMaxBranchLinearSubNodeLength-1];
int32_t byteNumber=0;
do {
int32_t i=starts[byteNumber]=start;
char byte=elements[i++].charAt(byteIndex, strings);
// No i<limit check: this loop covers all but the last different byte.
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
// Final value: exactly one string has this byte, and it ends right after it.
final[byteNumber]= start==i-1 && byteIndex+1==elements[start].getStringLength(strings);
start=i;
} while(++byteNumber<length-1);
// byteNumber==length-1, and the maxByte elements range is [start..limit[
starts[byteNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minByte sub-node first,
// then its jump delta would be larger.
// Instead we write the minByte sub-node last, for a shorter delta.
int32_t jumpTargets[ByteTrie::kMaxBranchLinearSubNodeLength-1];
do {
--byteNumber;
if(!final[byteNumber]) {
writeNode(starts[byteNumber], starts[byteNumber+1], byteIndex+1);
jumpTargets[byteNumber]=bytesLength;
}
} while(byteNumber>0);
// The maxByte sub-node is written as the very last one because we do
// not jump for it at all.
byteNumber=length-1;
writeNode(start, limit, byteIndex+1);
write((uint8_t)elements[start].charAt(byteIndex, strings));
// Write the rest of this node's byte-value pairs.
while(--byteNumber>=0) {
start=starts[byteNumber];
int32_t value;
if(final[byteNumber]) {
// Write the final value for the one string ending with this byte.
value=elements[start].getValue();
} else {
// Write the delta to the start position of the sub-node.
value=bytesLength-jumpTargets[byteNumber];
}
writeValueAndFinal(value, final[byteNumber]);
write((uint8_t)elements[start].charAt(byteIndex, strings));
}
// Write the split-branch nodes.
// The most recently recorded (innermost) split is written first.
while(ltLength>0) {
--ltLength;
writeDelta(bytesLength-lessThan[ltLength]); // less-than
write((uint8_t)middleBytes[ltLength]);
}
}
// Compacting-builder counterpart of writeNode(): creates and registers the
// Node object, and recursively all nodes below it, for the [start..limit[
// elements at string position byteIndex, instead of writing bytes.
// Returns the registered node, or NULL if errorCode is already a failure.
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length byteIndex.
DictTrieBuilder::Node *
ByteTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UBool hasValue=FALSE;
int32_t value=0;
// Since the elements are sorted, only the first one can end exactly at byteIndex.
if(byteIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
return registerFinalValue(value, errorCode);
}
hasValue=TRUE;
}
Node *node;
// Now all [start..limit[ strings are longer than byteIndex.
const ByteTrieElement &minElement=elements[start];
const ByteTrieElement &maxElement=elements[limit-1];
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
if(minByte==maxByte) {
// Linear-match node: All strings have the same character at byteIndex.
// Extend the match while the first and last strings agree;
// in sorted order, every string in between then agrees as well.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastByteIndex=byteIndex;
while(++lastByteIndex<minStringLength &&
minElement.charAt(lastByteIndex, strings)==
maxElement.charAt(lastByteIndex, strings)) {}
Node *nextNode=makeNode(start, limit, lastByteIndex, errorCode);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
// The chunks are created from the end of the sequence towards its start,
// each one pointing to the node that follows it.
const char *s=minElement.getString(strings).data();
int32_t length=lastByteIndex-byteIndex;
while(length>ByteTrie::kMaxLinearMatchLength) {
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
length-=ByteTrie::kMaxLinearMatchLength;
// NOTE(review): the result of new is not checked here; presumably
// registerNode() deals with a NULL node -- confirm.
node=new BTLinearMatchNode(
s+lastByteIndex,
ByteTrie::kMaxLinearMatchLength,
nextNode);
node=registerNode(node, errorCode);
nextNode=node;
}
node=new BTLinearMatchNode(s+byteIndex, length, nextNode);
} else {
// Branch node.
int32_t length=0; // Number of different bytes at byteIndex.
int32_t i=start;
do {
char byte=elements[i++].charAt(byteIndex, strings);
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minByte!=maxByte.
Node *subNode=makeBranchSubNode(start, limit, byteIndex, length, errorCode);
node=new BTBranchHeadNode(length, subNode);
}
// Registers the last linear-match chunk or the branch head created above.
node=registerNode(node, errorCode);
if(hasValue) {
// Wrap the node with the intermediate value found at byteIndex.
node=registerNode(new BTValueNode(value, node), errorCode);
}
return node;
}
// Compacting-builder counterpart of writeBranchSubNode(): creates and
// registers the Node objects that distinguish the length different bytes
// which the [start..limit[ elements have at byteIndex.
// While there are more than kMaxBranchLinearSubNodeLength different bytes,
// the range is split in half on a middle byte (BTSplitBranchNode);
// the remaining bytes go into one BTListBranchNode.
// Returns the registered node, or NULL on failure.
// start<limit && all strings longer than byteIndex &&
// length different bytes at byteIndex
DictTrieBuilder::Node *
ByteTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
// Split points collected below; their nodes are created at the very end.
char middleBytes[16];
Node *lessThan[16];
int32_t ltLength=0;
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle byte.
// First, find the middle byte.
int32_t count=length/2;
int32_t i=start;
char byte;
do {
byte=elements[i++].charAt(byteIndex, strings);
// No i<limit check: only length/2 of the length different bytes are
// skipped, so an element with another byte must follow.
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
} while(--count>0);
// Encode the less-than branch first.
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
lessThan[ltLength]=makeBranchSubNode(start, i, byteIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// A less-than branch may have failed above.
if(U_FAILURE(errorCode)) {
return NULL;
}
BTListBranchNode *listNode=new BTListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each byte, find its elements array start and whether it has a final value.
int32_t byteNumber=0;
do {
int32_t i=start;
char byte=elements[i++].charAt(byteIndex, strings);
// No i<limit check: this loop covers all but the last different byte.
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
// Exactly one string with this byte, ending right after it:
// store its value directly; otherwise build the sub-node.
if(start==i-1 && byteIndex+1==elements[start].getStringLength(strings)) {
listNode->add((uint8_t)byte, elements[start].getValue());
} else {
listNode->add((uint8_t)byte, makeNode(start, i, byteIndex+1, errorCode));
}
start=i;
} while(++byteNumber<length-1);
// byteNumber==length-1, and the maxByte elements range is [start..limit[
char byte=elements[start].charAt(byteIndex, strings);
if(start==limit-1 && byteIndex+1==elements[start].getStringLength(strings)) {
listNode->add((uint8_t)byte, elements[start].getValue());
} else {
listNode->add((uint8_t)byte, makeNode(start, limit, byteIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
// The most recently recorded (innermost) split wraps the list node first.
while(ltLength>0) {
--ltLength;
node=registerNode(
new BTSplitBranchNode(middleBytes[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
// Writes this node's value as a final value and records the resulting offset.
void
ByteTrieBuilder::BTFinalValueNode::write(DictTrieBuilder &builder) {
    ByteTrieBuilder &byteBuilder=static_cast<ByteTrieBuilder &>(builder);
    offset=byteBuilder.writeValueAndFinal(value, TRUE);
}
// Two value nodes are equal if they are the same object, or if the base class
// considers them equal and both continue with the same next node.
UBool
ByteTrieBuilder::BTValueNode::operator==(const Node &other) const {
    if(this!=&other) {
        if(!ValueNode::operator==(other)) {
            return FALSE;
        }
        // Only reached once the base class comparison has succeeded.
        const BTValueNode &otherValueNode=static_cast<const BTValueNode &>(other);
        return next==otherValueNode.next;
    }
    return TRUE;
}
// If this node has no offset yet (offset==0), passes the call on to the next
// node and stores the edge number it returns as this node's offset.
// Returns that edge number, or the unchanged edgeNumber for a node that
// already has an offset.
int32_t
ByteTrieBuilder::BTValueNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    edgeNumber=next->markRightEdgesFirst(edgeNumber);
    offset=edgeNumber;
    return edgeNumber;
}
// Writes the next node first and then this node's intermediate (non-final)
// value; records the resulting offset.
void
ByteTrieBuilder::BTValueNode::write(DictTrieBuilder &builder) {
    next->write(builder);
    ByteTrieBuilder &byteBuilder=static_cast<ByteTrieBuilder &>(builder);
    offset=byteBuilder.writeValueAndFinal(value, FALSE);
}
// Creates a linear-match node for the len bytes at bytes, followed by nextNode.
// The bytes are not copied; only the pointer is kept.
ByteTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode), s(bytes) {
    // Mix the matched bytes into the hash code set up by the base class constructor.
    int32_t bytesHash=uhash_hashCharsN(bytes, len);
    hash=hash*37+bytesHash;
}
// Two linear-match nodes are equal if they are the same object, or if the
// base class considers them equal and they match the same bytes.
UBool
ByteTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    if(this!=&other) {
        if(!LinearMatchNode::operator==(other)) {
            return FALSE;
        }
        // Only reached once the base class comparison has succeeded.
        const BTLinearMatchNode &otherMatchNode=static_cast<const BTLinearMatchNode &>(other);
        return uprv_memcmp(s, otherMatchNode.s, length)==0;
    }
    return TRUE;
}
// Writes the next node, then the matched bytes, then the lead byte that
// encodes the match length; records the resulting offset.
void
ByteTrieBuilder::BTLinearMatchNode::write(DictTrieBuilder &builder) {
    next->write(builder);
    ByteTrieBuilder &byteBuilder=static_cast<ByteTrieBuilder &>(builder);
    byteBuilder.write(s, length);
    int32_t leadByte=ByteTrie::kMinLinearMatch+length-1;
    offset=byteBuilder.write(leadByte);
}
// Writes this list-branch node: first the sub-nodes of all of its bytes,
// then the byte-value pairs that lead to them.
// An entry whose equal[] pointer is NULL has no sub-node;
// its values[] entry is written as a final value instead.
// Sets offset to the builder length after the last unit written.
void
ByteTrieBuilder::BTListBranchNode::write(DictTrieBuilder &builder) {
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minByte sub-node first,
// then its jump delta would be larger.
// Instead we write the minByte sub-node last, for a shorter delta.
int32_t byteNumber=length-1;
Node *rightEdge=equal[byteNumber];
// Upper end of the edge number range that is handed to
// writeUnlessInsideRightEdge() below.
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
do {
--byteNumber;
if(equal[byteNumber]!=NULL) {
equal[byteNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(byteNumber>0);
// The maxByte sub-node is written as the very last one because we do
// not jump for it at all.
byteNumber=length-1;
if(rightEdge==NULL) {
b.writeValueAndFinal(values[byteNumber], TRUE);
} else {
rightEdge->write(builder);
}
b.write(units[byteNumber]);
// Write the rest of this node's byte-value pairs.
while(--byteNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[byteNumber]==NULL) {
// Write the final value for the one string ending with this byte.
value=values[byteNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
// The sub-node must have been written by now (positive offset).
U_ASSERT(equal[byteNumber]->getOffset()>0);
value=b.bytesLength-equal[byteNumber]->getOffset();
isFinal=FALSE;
}
b.writeValueAndFinal(value, isFinal);
offset=b.write(units[byteNumber]);
}
}
// Writes both branches of this split-branch node and then the node itself:
// the delta to the less-than branch followed by the middle unit.
// Records the resulting offset.
void
ByteTrieBuilder::BTSplitBranchNode::write(DictTrieBuilder &builder) {
    ByteTrieBuilder &byteBuilder=static_cast<ByteTrieBuilder &>(builder);
    // Encode the less-than branch first.
    int32_t rightEdgeNumber=greaterOrEqual->getOffset();
    lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
    // Encode the greater-or-equal branch last because we do not jump for it at all.
    greaterOrEqual->write(builder);
    // Write this node.
    // The less-than branch must have been written by now (positive offset).
    U_ASSERT(lessThan->getOffset()>0);
    int32_t lessThanDelta=byteBuilder.bytesLength-lessThan->getOffset();
    byteBuilder.writeDelta(lessThanDelta);
    offset=byteBuilder.write(unit);
}
// Writes the branch sub-node and then the branch head: length-1, followed by
// an additional 0 byte when length is greater than ByteTrie::kMinLinearMatch.
// Records the offset of the last byte written.
void
ByteTrieBuilder::BTBranchHeadNode::write(DictTrieBuilder &builder) {
    next->write(builder);
    ByteTrieBuilder &byteBuilder=static_cast<ByteTrieBuilder &>(builder);
    int32_t lengthMinusOne=length-1;
    offset=byteBuilder.write(lengthMinusOne);
    if(lengthMinusOne>=ByteTrie::kMinLinearMatch) {
        offset=byteBuilder.write(0);
    }
}
// Makes sure that the bytes buffer can hold length bytes.
// If it is too small, a new buffer is allocated (the capacity is doubled
// until it exceeds length) and the existing bytes are copied to its end,
// since the buffer is filled from the back.
// Returns FALSE if there is no buffer or if the allocation fails;
// in the latter case the old buffer is freed and bytes is set to NULL.
UBool
ByteTrieBuilder::ensureCapacity(int32_t length) {
    if(bytes==NULL) {
        return FALSE;  // previous memory allocation had failed
    }
    if(length<=bytesCapacity) {
        return TRUE;  // enough room already
    }
    int32_t grownCapacity=2*bytesCapacity;
    while(grownCapacity<=length) {
        grownCapacity*=2;
    }
    char *grownBytes=reinterpret_cast<char *>(uprv_malloc(grownCapacity));
    if(grownBytes==NULL) {
        // unable to allocate memory
        uprv_free(bytes);
        bytes=NULL;
        return FALSE;
    }
    // Keep the written bytes aligned with the end of the buffer.
    const char *oldStart=bytes+(bytesCapacity-bytesLength);
    char *newStart=grownBytes+(grownCapacity-bytesLength);
    uprv_memcpy(newStart, oldStart, bytesLength);
    uprv_free(bytes);
    bytes=grownBytes;
    bytesCapacity=grownCapacity;
    return TRUE;
}
int32_t
ByteTrieBuilder::write(int32_t byte) {
int32_t newLength=bytesLength+1;
if(ensureCapacity(newLength)) {
bytesLength=newLength;
bytes[bytesCapacity-bytesLength]=(char)byte;
}
return bytesLength;
}
int32_t
ByteTrieBuilder::write(const char *b, int32_t length) {
int32_t newLength=bytesLength+length;
if(ensureCapacity(newLength)) {
bytesLength=newLength;
uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
}
return bytesLength;
}
// Writes the value i in the variable-length value encoding (1 to 5 bytes,
// most significant byte first after the lead byte).
// The lead byte is shifted left by one bit and the final flag is stored in
// its lowest bit.
// Returns the builder's bytesLength after writing.
int32_t
ByteTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
    char intBytes[5];
    int32_t length;
    if(i<0 || i>0xffffff) {
        // Negative or larger than 24 bits: lead byte plus all four value bytes.
        intBytes[0]=(char)ByteTrie::kFiveByteValueLead;
        intBytes[1]=(char)(i>>24);
        intBytes[2]=(char)(i>>16);
        intBytes[3]=(char)(i>>8);
        intBytes[4]=(char)i;
        length=5;
    } else if(i<=ByteTrie::kMaxOneByteValue) {
        intBytes[0]=(char)(ByteTrie::kMinOneByteValueLead+i);
        length=1;
    } else if(i<=ByteTrie::kMaxTwoByteValue) {
        intBytes[0]=(char)(ByteTrie::kMinTwoByteValueLead+(i>>8));
        intBytes[1]=(char)i;
        length=2;
    } else if(i<=ByteTrie::kMaxThreeByteValue) {
        intBytes[0]=(char)(ByteTrie::kMinThreeByteValueLead+(i>>16));
        intBytes[1]=(char)(i>>8);
        intBytes[2]=(char)i;
        length=3;
    } else {
        intBytes[0]=(char)ByteTrie::kFourByteValueLead;
        intBytes[1]=(char)(i>>16);
        intBytes[2]=(char)(i>>8);
        intBytes[3]=(char)i;
        length=4;
    }
    intBytes[0]=(char)((intBytes[0]<<1)|final);
    return write(intBytes, length);
}
// Writes the non-negative jump delta i in the variable-length delta encoding:
// - up to kMaxOneByteDelta: the delta itself as a single byte
// - otherwise a lead byte (which, for 2- and 3-byte deltas, also carries the
//   most significant bits) followed by the remaining bytes,
//   most significant first.
// Returns the builder's bytesLength after writing.
int32_t
ByteTrieBuilder::writeDelta(int32_t i) {
    char intBytes[5];
    int32_t length;
    U_ASSERT(i>=0);
    if(i<=ByteTrie::kMaxOneByteDelta) {
        length=0;
    } else {
        // Each trailing byte is appended at the current length, so that no
        // position is overwritten or left uninitialized whatever the lead byte.
        length=1;
        if(i<=ByteTrie::kMaxTwoByteDelta) {
            intBytes[0]=(char)(ByteTrie::kMinTwoByteDeltaLead+(i>>8));
        } else {
            if(i<=ByteTrie::kMaxThreeByteDelta) {
                intBytes[0]=(char)(ByteTrie::kMinThreeByteDeltaLead+(i>>16));
            } else {
                if(i<=0xffffff) {
                    intBytes[0]=(char)ByteTrie::kFourByteDeltaLead;
                } else {
                    intBytes[0]=(char)ByteTrie::kFiveByteDeltaLead;
                    intBytes[length++]=(char)(i>>24);
                }
                intBytes[length++]=(char)(i>>16);
            }
            intBytes[length++]=(char)(i>>8);
        }
    }
    intBytes[length++]=(char)i;
    return write(intBytes, length);
}
U_NAMESPACE_END

View file

@ -0,0 +1,123 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetriebuilder.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*
* Builder class for ByteTrie dictionary trie.
*/
#ifndef __BYTETRIEBUILDER_H__
#define __BYTETRIEBUILDER_H__
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "charstr.h"
#include "dicttriebuilder.h"
U_NAMESPACE_BEGIN
class ByteTrieElement;
// Builder for a byte-serialized ByteTrie dictionary trie.
// Usage: add() all (byte string, value) pairs, then build() once.
// The StringPiece returned by build() points into memory owned by the builder.
class U_TOOLUTIL_API ByteTrieBuilder : public DictTrieBuilder {
public:
ByteTrieBuilder()
: elements(NULL), elementsCapacity(0), elementsLength(0),
bytes(NULL), bytesCapacity(0), bytesLength(0) {}
~ByteTrieBuilder();
// Adds a (byte string, value) pair; not allowed after build().
ByteTrieBuilder &add(const StringPiece &s, int32_t value, UErrorCode &errorCode);
// Builds the trie, or returns the bytes of the one that has already been built.
StringPiece build(UDictTrieBuildOption buildOption, UErrorCode &errorCode);
// Forgets all added strings and any built trie.
// The lengths are reset; the allocated arrays and capacities are kept.
ByteTrieBuilder &clear() {
strings.clear();
elementsLength=0;
bytesLength=0;
return *this;
}
private:
// Direct serialization of the sorted elements, without compaction.
void writeNode(int32_t start, int32_t limit, int32_t byteIndex);
void writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length);
// Same structure as the two functions above,
// but creating Node objects for the compacting builder.
Node *makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode);
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
int32_t length, UErrorCode &errorCode);
// Low-level output into the bytes buffer.
// The write functions return the new bytesLength.
UBool ensureCapacity(int32_t length);
int32_t write(int32_t byte);
int32_t write(const char *b, int32_t length);
int32_t writeValueAndFinal(int32_t i, UBool final);
int32_t writeDelta(int32_t i);
// Compacting builder.
// ByteTrie-specific node classes; each write() serializes the node into
// the ByteTrieBuilder that is passed in as a DictTrieBuilder.
class BTFinalValueNode : public FinalValueNode {
public:
BTFinalValueNode(int32_t v) : FinalValueNode(v) {}
virtual void write(DictTrieBuilder &builder);
};
class BTValueNode : public ValueNode {
public:
BTValueNode(int32_t v, Node *nextNode)
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
virtual void write(DictTrieBuilder &builder);
private:
Node *next;
};
class BTLinearMatchNode : public LinearMatchNode {
public:
BTLinearMatchNode(const char *units, int32_t len, Node *nextNode);
virtual UBool operator==(const Node &other) const;
virtual void write(DictTrieBuilder &builder);
private:
// The matched bytes; points to memory that is not owned by the node.
const char *s;
};
class BTListBranchNode : public ListBranchNode {
public:
BTListBranchNode() : ListBranchNode() {}
virtual void write(DictTrieBuilder &builder);
};
class BTSplitBranchNode : public SplitBranchNode {
public:
BTSplitBranchNode(char middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
: SplitBranchNode((uint8_t)middleUnit, lessThanNode, greaterOrEqualNode) {}
virtual void write(DictTrieBuilder &builder);
};
class BTBranchHeadNode : public BranchHeadNode {
public:
BTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
virtual void write(DictTrieBuilder &builder);
};
virtual Node *createFinalValueNode(int32_t value) const { return new BTFinalValueNode(value); }
// Storage for the bytes of all added strings, each prefixed with its length.
CharString strings;
// The added (string, value) pairs; the array grows as needed in add().
ByteTrieElement *elements;
int32_t elementsCapacity;
int32_t elementsLength;
// Byte serialization of the trie.
// Grows from the back: bytesLength measures from the end of the buffer!
char *bytes;
int32_t bytesCapacity;
int32_t bytesLength;
};
U_NAMESPACE_END
#endif // __BYTETRIEBUILDER_H__

View file

@ -0,0 +1,167 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetrieiterator.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov03
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "bytetrie.h"
#include "bytetrieiterator.h"
#include "charstr.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Iterates from the root of the byte-serialized trie at trieBytes.
// Both the current and the initial position are the start of the bytes, and
// remainingMatchLength_ is -1, i.e., there is no pending linear-match node.
// maxStringLength is stored as maxLength_; next() treats values <=0 as "no limit".
// errorCode is passed on to the stack_ (UVector32) constructor.
ByteTrieIterator::ByteTrieIterator(const void *trieBytes, int32_t maxStringLength,
UErrorCode &errorCode)
: bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), initialPos_(bytes_),
remainingMatchLength_(-1), initialRemainingMatchLength_(-1),
maxLength_(maxStringLength), value_(0), stack_(errorCode) {}
// Iterates from the current state of the given ByteTrie.
// The trie's position and remaining match length are copied as both the current
// and the initial iterator state. If the trie stopped in the middle of a
// linear-match node, the pending bytes (limited to maxStringLength if that is
// positive) are appended to str_ right away.
ByteTrieIterator::ByteTrieIterator(const ByteTrie &trie, int32_t maxStringLength,
                                   UErrorCode &errorCode)
        : bytes_(trie.bytes_), pos_(trie.pos_), initialPos_(trie.pos_),
          remainingMatchLength_(trie.remainingMatchLength_),
          initialRemainingMatchLength_(trie.remainingMatchLength_),
          maxLength_(maxStringLength), value_(0), stack_(errorCode) {
    if(remainingMatchLength_<0) {
        return;  // Not inside a linear-match node, nothing to pre-load.
    }
    // remainingMatchLength_ is the actual remaining match length minus 1.
    int32_t pending=remainingMatchLength_+1;
    if(maxLength_>0 && pending>maxLength_) {
        // Truncated: remainingMatchLength_ stays >=0 below, which next() uses as a signal.
        pending=maxLength_;
    }
    str_.append(reinterpret_cast<const char *>(pos_), pending, errorCode);
    pos_+=pending;
    remainingMatchLength_-=pending;
}
// Restores the state that the constructor left behind: empty backtracking stack,
// str_ cut back to the initially pending linear-match bytes (at most maxLength_
// of them if maxLength_ is positive), and the position/remaining match length
// advanced past those bytes.
ByteTrieIterator &ByteTrieIterator::reset() {
    stack_.setSize(0);
    // initialRemainingMatchLength_ is the remaining match length minus 1,
    // so this is 0 when there was no pending linear-match node.
    int32_t prefixLength=initialRemainingMatchLength_+1;
    if(maxLength_>0 && prefixLength>maxLength_) {
        prefixLength=maxLength_;
    }
    str_.truncate(prefixLength);
    pos_=initialPos_+prefixLength;
    remainingMatchLength_=initialRemainingMatchLength_-prefixLength;
    return *this;
}
// Finds the next (byte sequence, value) pair.
// On success, sp_ refers to the bytes collected in str_ and value_ holds the value
// (-1 if the sequence was truncated to maxLength_ without reaching a value).
// @param errorCode In/out error code; the function returns FALSE immediately
//                  if it is already a failure.
// @return TRUE if a pair was found, FALSE if the iteration is finished.
UBool
ByteTrieIterator::next(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return FALSE;
}
const uint8_t *pos=pos_;
// pos_==NULL means that the previous result ended at a final value or at a
// truncation; continue with saved branch state if there is any.
if(pos==NULL) {
if(stack_.isEmpty()) {
return FALSE;
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
// Each state is a pair: [offset from bytes_, (remaining branch length<<16)|str length].
int32_t stackSize=stack_.size();
int32_t length=stack_.elementAti(stackSize-1);
pos=bytes_+stack_.elementAti(stackSize-2);
stack_.setSize(stackSize-2);
// Restore the string to what it was before the branch node.
str_.truncate(length&0xffff);
length=(int32_t)((uint32_t)length>>16);
if(length>1) {
// More than one edge left: take the next one and push state for the rest.
pos=branchNext(pos, length, errorCode);
if(pos==NULL) {
return TRUE; // Reached a final value.
}
} else {
// Last edge of the branch: append its byte and continue with the node after it.
str_.append((char)*pos++, errorCode);
}
}
if(remainingMatchLength_>=0) {
// We only get here if we started in a pending linear-match node
// with more than maxLength remaining bytes.
return truncateAndStop();
}
for(;;) {
int32_t node=*pos++;
if(node>=ByteTrie::kMinValueLead) {
// Deliver value for the byte sequence so far.
UBool isFinal=(UBool)(node&ByteTrie::kValueIsFinal);
value_=ByteTrie::readValue(pos, node>>1);
if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
// Nothing more to read along this path; the next call backtracks.
pos_=NULL;
} else {
// Intermediate value: resume after the value bytes on the next call.
pos_=ByteTrie::skipValue(pos, node);
}
sp_.set(str_.data(), str_.length());
return TRUE;
}
// Not a value node: stop here if the string has reached the maximum length.
if(maxLength_>0 && str_.length()==maxLength_) {
return truncateAndStop();
}
if(node<ByteTrie::kMinLinearMatch) {
// Branch node. A lead byte of 0 means that the length is in the next byte.
if(node==0) {
node=*pos++;
}
pos=branchNext(pos, node+1, errorCode);
if(pos==NULL) {
return TRUE; // Reached a final value.
}
} else {
// Linear-match node, append length bytes to str_.
int32_t length=node-ByteTrie::kMinLinearMatch+1;
if(maxLength_>0 && str_.length()+length>maxLength_) {
// Only part of the match fits: append up to maxLength_ and stop.
str_.append(reinterpret_cast<const char *>(pos),
maxLength_-str_.length(), errorCode);
return truncateAndStop();
}
str_.append(reinterpret_cast<const char *>(pos), length, errorCode);
pos+=length;
}
}
}
// Branch node, needs to take the first outbound edge and push state for the rest.
// Takes the first outbound edge of a branch with `length` remaining edges,
// starting at pos, and pushes backtracking state for the other edges.
// @return the position of the next node to read, or NULL if the edge ended in a
//         final value (in which case pos_, sp_ and value_ have been set).
const uint8_t *
ByteTrieIterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
// Binary-split levels: descend along the less-than edges.
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
// Push state for the greater-or-equal edge.
// It starts after the delta and covers the upper length-(length>>1) edges.
stack_.addElement((int32_t)(ByteTrie::skipDelta(pos)-bytes_), errorCode);
stack_.addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
// Follow the less-than edge.
length>>=1;
pos=ByteTrie::jumpByDelta(pos);
}
// List of key-value pairs where values are either final values or jump deltas.
// Read the first (key, value) pair.
uint8_t trieByte=*pos++;
int32_t node=*pos++;
UBool isFinal=(UBool)(node&ByteTrie::kValueIsFinal);
int32_t value=ByteTrie::readValue(pos, node>>1);
pos=ByteTrie::skipValue(pos, node);
// Push state for the remaining length-1 pairs, which start right after this value.
// The string length is recorded before the key byte is appended.
stack_.addElement((int32_t)(pos-bytes_), errorCode);
stack_.addElement(((length-1)<<16)|str_.length(), errorCode);
str_.append((char)trieByte, errorCode);
if(isFinal) {
// Final value: publish the result; pos_=NULL makes the next call backtrack.
pos_=NULL;
sp_.set(str_.data(), str_.length());
value_=value;
return NULL;
} else {
// Not final: the value is a jump delta relative to the end of the value bytes.
return pos+value;
}
}
U_NAMESPACE_END

View file

@ -0,0 +1,126 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetrieiterator.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov03
* created by: Markus W. Scherer
*/
#ifndef __BYTETRIEITERATOR_H__
#define __BYTETRIEITERATOR_H__
/**
* \file
* \brief C++ API: ByteTrie iterator for all of its (byte sequence, value) pairs.
*/
// Needed if and when we change the .dat package index to a ByteTrie,
// so that icupkg can work with an input package.
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "bytetrie.h"
#include "charstr.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/**
* Iterator for all of the (byte sequence, value) pairs in a ByteTrie.
*/
class U_TOOLUTIL_API ByteTrieIterator : public UMemory {
public:
/**
* Iterates from the root of a byte-serialized ByteTrie.
* @param trieBytes The trie bytes.
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
ByteTrieIterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Iterates from the current state of the specified ByteTrie.
* @param trie The trie whose state will be copied for iteration.
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
ByteTrieIterator(const ByteTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Resets this iterator to its initial state.
* @return *this
*/
ByteTrieIterator &reset();
/**
* Finds the next (byte sequence, value) pair if there is one.
*
* If the byte sequence is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @param errorCode Standard ICU error code. If it is a failure on input,
* the function returns FALSE immediately.
* @return TRUE if there is another element.
*/
UBool next(UErrorCode &errorCode);
/**
* @return TRUE if there are more elements.
*/
UBool hasNext() const { return pos_!=NULL || !stack_.isEmpty(); }
/**
* @return the NUL-terminated byte sequence for the last successful next()
*/
const StringPiece &getString() const { return sp_; }
/**
* @return the value for the last successful next()
*/
int32_t getValue() const { return value_; }
private:
// Ends the current path without a real value: the next call to next()
// backtracks (pos_==NULL), and the reported value is -1.
UBool truncateAndStop() {
pos_=NULL;
value_=-1; // no real value for str
sp_.set(str_.data(), str_.length());
return TRUE;
}
const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
// Start of the trie bytes; stack offsets are relative to this.
const uint8_t *bytes_;
// Current position, or NULL after a final value or truncation.
const uint8_t *pos_;
// Position to which reset() returns.
const uint8_t *initialPos_;
// Remaining length of a pending linear-match node minus 1, or negative if none.
int32_t remainingMatchLength_;
int32_t initialRemainingMatchLength_;
// The byte sequence collected so far, and a StringPiece view of it for getString().
CharString str_;
StringPiece sp_;
// Maximum string length; values <=0 are treated as "no limit".
int32_t maxLength_;
int32_t value_;
// The stack stores pairs of integers for backtracking to another
// outbound edge of a branch node.
// The first integer is an offset from ByteTrie.bytes.
// The second integer has the str.length() from before the node in bits 15..0,
// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
// but the code looks more confusing that way.)
UVector32 stack_;
};
U_NAMESPACE_END
#endif // __BYTETRIEITERATOR_H__

View file

@ -0,0 +1,158 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: denseranges.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*
* Helper code for finding a small number of dense ranges.
*/
#include "unicode/utypes.h"
#include "denseranges.h"
// Definitions in the anonymous namespace are invisible outside this file.
namespace {

/**
 * Keeps track of the largest gaps seen so far, up to a caller-supplied maximum
 * that is itself limited to 15 (kCapacity).
 * The gaps are stored from the largest (index 0) to the smallest;
 * among gaps of equal length, the one added earlier comes first.
 */
class LargestGaps {
public:
    /** @param max Maximum number of gaps to keep; values above kCapacity are reduced to it. */
    LargestGaps(int32_t max) : maxLength(max), length(0) {
        if(maxLength>kCapacity) {
            maxLength=kCapacity;
        }
    }

    /**
     * Offers a gap. It is stored if it is among the maxLength largest so far,
     * in which case the smallest stored gap is dropped when the list is full.
     */
    void add(int32_t gapStart, int64_t gapLength) {
        // Find the slot after all stored gaps that are at least as long as the new one.
        int32_t slot=length;
        for(; slot>0; --slot) {
            if(gapLengths[slot-1]>=gapLength) {
                break;
            }
        }
        if(slot>=maxLength) {
            return;  // Smaller than everything in an already-full list.
        }
        // Either grow the list by one, or overwrite the last (smallest) entry.
        int32_t last;
        if(length<maxLength) {
            last=length;
            ++length;
        } else {
            last=maxLength-1;
        }
        // Shift the smaller gaps back by one to make room at the slot.
        for(int32_t k=last; k>slot; --k) {
            gapStarts[k]=gapStarts[k-1];
            gapLengths[k]=gapLengths[k-1];
        }
        gapStarts[slot]=gapStart;
        gapLengths[slot]=gapLength;
    }

    /** Keeps only the first (largest) newLength gaps; never grows the list. */
    void truncate(int32_t newLength) {
        if(newLength<length) {
            length=newLength;
        }
    }

    int32_t count() const { return length; }
    int32_t gapStart(int32_t i) const { return gapStarts[i]; }
    int64_t gapLength(int32_t i) const { return gapLengths[i]; }

    /**
     * @return the index of the stored gap with the smallest start that is
     *         greater than value, or -1 if there is none.
     */
    int32_t firstAfter(int32_t value) const {
        int32_t best=-1;
        for(int32_t i=0; i<length; ++i) {
            if(gapStarts[i]<=value) {
                continue;
            }
            if(best<0 || gapStarts[i]<gapStarts[best]) {
                best=i;
            }
        }
        return best;
    }

private:
    static const int32_t kCapacity=15;

    int32_t maxLength;
    int32_t length;
    int32_t gapStarts[kCapacity];
    int64_t gapLengths[kCapacity];
};

}  // namespace
/**
* Does it make sense to write 1..capacity ranges?
* Returns 0 if not, otherwise the number of ranges.
* @param values Sorted array of signed-integer values.
* @param length Number of values.
* @param density Minimum average range density, in 256ths. (0x100=100%=perfectly dense.)
* Should be 0x80..0x100, must be 1..0x100.
* @param ranges Output ranges array.
* @param capacity Maximum number of ranges.
* @return Minimum number of ranges (at most capacity) that have the desired density,
* or 0 if that density cannot be achieved.
*/
// Tries to cover the sorted values with 1..capacity ranges whose overall density
// (number of values / total range length) is at least density/0x100.
// Returns the number of ranges written to ranges[], or 0 if there are too few
// values or the requested density cannot be reached.
U_CAPI int32_t U_EXPORT2
uprv_makeDenseRanges(const int32_t values[], int32_t length,
                     int32_t density,
                     int32_t ranges[][2], int32_t capacity) {
    if(length<=2) {
        return 0;
    }
    const int32_t first=values[0];
    const int32_t last=values[length-1];  // Assume first<=last.
    // 64-bit arithmetic for intermediate-value precision and so that
    // last-first cannot overflow a signed 32-bit integer.
    int64_t span=(int64_t)last-(int64_t)first+1;
    if(length>=(density*span)/0x100) {
        // A single range is dense enough.
        ranges[0][0]=first;
        ranges[0][1]=last;
        return 1;
    }
    if(length<=4) {
        return 0;
    }

    // Try to split [first, last] into 2..capacity ranges,
    // divided by the 1..(capacity-1) largest gaps.
    LargestGaps gaps(capacity-1);
    int32_t previous=first;
    for(int32_t k=1; k<length; ++k) {
        const int32_t expected=previous+1;
        const int32_t actual=values[k];
        if(actual!=expected) {
            gaps.add(expected, (int64_t)actual-(int64_t)expected);
        }
        previous=actual;
    }

    // There is at least one gap here because there are fewer values than the
    // length of [first..last]; otherwise the single range above would have been used.
    // Remove the largest gaps one at a time until the remaining span is dense enough.
    int32_t gapsUsed=0;
    for(;;) {
        if(gapsUsed>=gaps.count()) {
            // Too sparse for capacity or fewer ranges of the requested density.
            return 0;
        }
        span-=gaps.gapLength(gapsUsed);
        ++gapsUsed;
        const int32_t numRanges=gapsUsed+1;
        if(length>numRanges*2 && length>=(density*span)/0x100) {
            break;
        }
    }

    // Write the gapsUsed+1 ranges that are separated by the gapsUsed largest gaps,
    // walking through those gaps in order of their start values.
    gaps.truncate(gapsUsed);
    int32_t rangeStart=first;
    ranges[0][0]=rangeStart;
    for(int32_t r=0; r<gapsUsed; ++r) {
        const int32_t gapIndex=gaps.firstAfter(rangeStart);
        const int32_t gapStart=gaps.gapStart(gapIndex);
        ranges[r][1]=gapStart-1;
        rangeStart=(int32_t)(gapStart+gaps.gapLength(gapIndex));
        ranges[r+1][0]=rangeStart;
    }
    ranges[gapsUsed][1]=last;
    return gapsUsed+1;
}

View file

@ -0,0 +1,39 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: denseranges.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010sep25
* created by: Markus W. Scherer
*
* Helper code for finding a small number of dense ranges.
*/
#ifndef __DENSERANGES_H__
#define __DENSERANGES_H__
#include "unicode/utypes.h"
/**
* Does it make sense to write 1..capacity ranges?
* Returns 0 if not, otherwise the number of ranges.
* @param values Sorted array of signed-integer values.
* @param length Number of values.
* @param density Minimum average range density, in 256ths. (0x100=100%=perfectly dense.)
* Should be 0x80..0x100, must be 1..0x100.
* @param ranges Output ranges array.
* @param capacity Maximum number of ranges.
* @return Minimum number of ranges (at most capacity) that have the desired density,
* or 0 if that density cannot be achieved.
*/
U_CAPI int32_t U_EXPORT2
uprv_makeDenseRanges(const int32_t values[], int32_t length,
int32_t density,
int32_t ranges[][2], int32_t capacity);
#endif // __DENSERANGES_H__

View file

@ -0,0 +1,267 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: dicttriebuilder.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec24
* created by: Markus W. Scherer
*
* Base class for dictionary-trie builder classes.
*/
#include <typeinfo> // for 'typeid' to work
#include "unicode/utypes.h"
#include "dicttriebuilder.h"
#include "uassert.h"
#include "uhash.h"
// C-linkage callbacks for the UHashtable of nodes (passed to uhash_openSize() in
// createCompactBuilder()). They forward to the static DictTrieBuilder functions.
U_CDECL_BEGIN
// Hash callback: hashes the Node that the key points to.
// NOTE(review): DictTrieBuilder::hashNode() is declared to return UBool, so the value
// returned here appears to be the node's hash narrowed to UBool's width -- confirm
// and consider changing hashNode() to return int32_t (declaration and definition).
static int32_t U_CALLCONV
hashDictTrieNode(const UHashTok key) {
return U_NAMESPACE_QUALIFIER DictTrieBuilder::hashNode(key.pointer);
}
// Key comparison callback: compares the two Nodes with their virtual operator==().
static UBool U_CALLCONV
equalDictTrieNodes(const UHashTok key1, const UHashTok key2) {
return U_NAMESPACE_QUALIFIER DictTrieBuilder::equalNodes(key1.pointer, key2.pointer);
}
U_CDECL_END
U_NAMESPACE_BEGIN
// Starts without a node hash table; createCompactBuilder() creates it on demand.
DictTrieBuilder::DictTrieBuilder() : nodes(NULL) {}
// Closes the node hash table, if any.
DictTrieBuilder::~DictTrieBuilder() {
deleteCompactBuilder();
}
// Creates the hash table that holds the unique nodes.
// Does nothing if errorCode is already a failure. Sets U_MEMORY_ALLOCATION_ERROR
// if the table could not be created without an error being reported.
// On success, the table is set up to delete its keys (the nodes) as UObjects.
void
DictTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return;
    }
    nodes=uhash_openSize(hashDictTrieNode, equalDictTrieNodes, NULL,
                         sizeGuess, &errorCode);
    if(U_FAILURE(errorCode)) {
        return;
    }
    if(nodes==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uhash_setKeyDeleter(nodes, uhash_deleteUObject);
}
// Closes the node hash table and forgets it, so that calling this again
// (for example from the destructor) passes NULL to uhash_close().
void
DictTrieBuilder::deleteCompactBuilder() {
uhash_close(nodes);
nodes=NULL;
}
// Returns the unique registered node that is equivalent to newNode.
// Takes ownership of newNode: it is deleted if errorCode is already a failure or
// if an equivalent node was registered before; otherwise it is added to the
// nodes hash table and returned.
// Sets U_MEMORY_ALLOCATION_ERROR if newNode is NULL. Returns NULL on any failure.
DictTrieBuilder::Node *
DictTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
const UHashElement *old=uhash_find(nodes, newNode);
if(old!=NULL) {
// Duplicate: keep the previously registered node.
delete newNode;
return (Node *)old->key.pointer;
}
// If uhash_puti() returns a non-zero value from an equivalent, previously
// registered node, then uhash_find() failed to find that and we will leak newNode.
#if !U_RELEASE
int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
uhash_puti(nodes, newNode, 1, &errorCode);
U_ASSERT(oldValue==0);
if(U_FAILURE(errorCode)) {
// NOTE(review): createCompactBuilder() installs a key deleter on this table.
// If uhash_puti() already deletes the key itself when it fails, then this
// delete frees newNode a second time -- confirm against the uhash implementation.
delete newNode;
return NULL;
}
return newNode;
}
// Returns the unique registered FinalValueNode for the given value.
// Looks it up with a stack-allocated key first, so that a new node is only
// created (via the subclass's createFinalValueNode()) when none is registered yet.
// Sets U_MEMORY_ALLOCATION_ERROR if node creation returns NULL.
// Returns NULL on any failure.
DictTrieBuilder::Node *
DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
// Temporary lookup key; see FinalValueNode::operator==() for why comparing it
// with a subclass instance works.
FinalValueNode key(value);
const UHashElement *old=uhash_find(nodes, &key);
if(old!=NULL) {
return (Node *)old->key.pointer;
}
Node *newNode=createFinalValueNode(value);
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// If uhash_puti() returns a non-zero value from an equivalent, previously
// registered node, then uhash_find() failed to find that and we will leak newNode.
#if !U_RELEASE
int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
uhash_puti(nodes, newNode, 1, &errorCode);
U_ASSERT(oldValue==0);
if(U_FAILURE(errorCode)) {
// NOTE(review): createCompactBuilder() installs a key deleter on this table.
// If uhash_puti() already deletes the key itself when it fails, then this
// delete frees newNode a second time -- confirm against the uhash implementation.
delete newNode;
return NULL;
}
return newNode;
}
// Hash function for the nodes hash table: returns the hash code of the Node
// that the pointer refers to.
// NOTE(review): the return type is UBool although Node::hashCode() returns int32_t,
// so the hash appears to be narrowed to UBool's width, which would cause many
// avoidable hash collisions. The return type should probably be int32_t; that has
// to be changed together with the declaration in dicttriebuilder.h.
UBool DictTrieBuilder::hashNode(const void *node) {
return ((const Node *)node)->hashCode();
}
// Key comparison for the nodes hash table: both pointers must refer to Nodes,
// which are compared with the virtual Node::operator==().
UBool DictTrieBuilder::equalNodes(const void *left, const void *right) {
    const Node &a=*static_cast<const Node *>(left);
    const Node &b=*static_cast<const Node *>(right);
    return a==b;
}
// Base comparison: a node equals itself; otherwise two nodes can only be equal
// if they have the same dynamic class and the same hash code.
// Subclasses call this first and then compare their own fields.
UBool DictTrieBuilder::Node::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    return typeid(*this)==typeid(other) && hash==other.hash;
}
// Default for nodes without outbound edges: records the edge number in offset
// unless the node has been marked already, and passes the number back unchanged.
int32_t DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;  // Already marked.
    }
    offset=edgeNumber;
    return edgeNumber;
}
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder::Node)
// Two final-value nodes are equal if their hash codes and values match,
// regardless of which FinalValueNode subclass each of them is.
UBool DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
// Not:
// if(!Node::operator==(other)) {
// return FALSE;
// }
// because registerFinalValue() compares a stack-allocated FinalValueNode
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
// with the specific builder's subclass of FinalValueNode,
// and Node::operator==(other) will always be false for that because it
// compares the typeid's.
// This workaround assumes that the subclass does not add fields that need to be compared.
if(hash!=other.hashCode()) {
return FALSE;
}
// The dynamic_cast yields NULL if other is not a FinalValueNode (or subclass) at all.
const FinalValueNode *o=dynamic_cast<const FinalValueNode *>(&other);
return o!=NULL && value==o->value;
}
// Equal if the base comparison (same class, same hash) succeeds and both nodes
// agree on whether they have a value and, if so, on the value itself.
UBool DictTrieBuilder::ValueNode::operator==(const Node &other) const {
    if(this!=&other) {
        if(!Node::operator==(other)) {
            return FALSE;
        }
        // Same dynamic class as this node, so other is a ValueNode too.
        const ValueNode &that=static_cast<const ValueNode &>(other);
        if(hasValue!=that.hasValue) {
            return FALSE;
        }
        if(hasValue && value!=that.value) {
            return FALSE;
        }
    }
    return TRUE;
}
// Equal if the ValueNode comparison succeeds and both nodes have the same match
// length and continue with the very same next node (pointer identity).
// The matched units themselves are not stored in this class and not compared here.
UBool DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!ValueNode::operator==(other)) {
        return FALSE;
    }
    const LinearMatchNode &that=static_cast<const LinearMatchNode &>(other);
    if(length!=that.length) {
        return FALSE;
    }
    return next==that.next;
}
// Marks the next node first, then records the resulting edge number in this
// node's offset. Does nothing if this node has been marked already.
int32_t DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    const int32_t marked=next->markRightEdgesFirst(edgeNumber);
    offset=marked;
    return marked;
}
// Equal if the base comparison (same class, same hash) succeeds and both lists
// have the same number of entries with pairwise equal units, values and
// sub-nodes (sub-nodes are compared by pointer identity).
UBool DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!Node::operator==(other)) {
        return FALSE;
    }
    const ListBranchNode &o=(const ListBranchNode &)other;
    // The lengths must be compared explicitly: equal hash codes do not guarantee
    // equal lengths, and only the first `length` entries of each array are ever
    // initialized by add(). Without this check, a hash collision between lists of
    // different lengths would read uninitialized entries of the shorter list or
    // report a match on a common prefix.
    if(length!=o.length) {
        return FALSE;
    }
    for(int32_t i=0; i<length; ++i) {
        if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
            return FALSE;
        }
    }
    return TRUE;
}
// Marks the sub-nodes of this list from the last (rightmost) entry to the first.
// The rightmost sub-node continues with the incoming edge number; each entry
// further left starts one below the number returned for the previous one.
// Entries with a final value (NULL sub-node) are skipped.
// Does nothing if this node has been marked already (offset!=0).
int32_t DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
// Remember the first number used for any of this node's edges.
firstEdgeNumber=edgeNumber;
int32_t step=0;
int32_t i=length;
// NOTE(review): the do-while reads equal[length-1] unconditionally, so this
// assumes length>=1 -- confirm that empty list nodes are never registered.
do {
Node *edge=equal[--i];
if(edge!=NULL) {
edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
}
// For all but the rightmost edge, decrement the edge number.
step=1;
} while(i>0);
offset=edgeNumber;
}
return edgeNumber;
}
// Equal if the base comparison (same class, same hash) succeeds and both nodes
// split on the same unit and lead to the very same two sub-nodes (pointer identity).
UBool DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!Node::operator==(other)) {
        return FALSE;
    }
    const SplitBranchNode &that=static_cast<const SplitBranchNode &>(other);
    if(unit!=that.unit || lessThan!=that.lessThan) {
        return FALSE;
    }
    return greaterOrEqual==that.greaterOrEqual;
}
// Marks the greater-or-equal (right) sub-node first with the incoming edge number,
// then the less-than (left) sub-node starting one below the number returned for
// the right side. Does nothing if this node has been marked already.
int32_t DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    // Remember the first number used for any of this node's edges.
    firstEdgeNumber=edgeNumber;
    const int32_t afterRight=greaterOrEqual->markRightEdgesFirst(edgeNumber);
    const int32_t afterLeft=lessThan->markRightEdgesFirst(afterRight-1);
    offset=afterLeft;
    return afterLeft;
}
// Equal if the ValueNode comparison succeeds and both heads have the same branch
// length and refer to the very same branch sub-node (pointer identity).
UBool DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
    if(this==&other) {
        return TRUE;
    }
    if(!ValueNode::operator==(other)) {
        return FALSE;
    }
    const BranchHeadNode &that=static_cast<const BranchHeadNode &>(other);
    if(length!=that.length) {
        return FALSE;
    }
    return next==that.next;
}
// Marks the branch sub-node first, then records the resulting edge number in this
// node's offset. Does nothing if this node has been marked already.
int32_t DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    const int32_t marked=next->markRightEdgesFirst(edgeNumber);
    offset=marked;
    return marked;
}
U_NAMESPACE_END

View file

@ -0,0 +1,251 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: dicttriebuilder.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec24
* created by: Markus W. Scherer
*
* Base class for dictionary-trie builder classes.
*/
#ifndef __DICTTRIEBUILDER_H__
#define __DICTTRIEBUILDER_H__
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "uhash.h"
// Build options for the dictionary-trie builders (passed to build()).
// NOTE(review): judging by the names, FAST presumably favors build speed and SMALL
// favors a smaller result (the compacting node builder) -- confirm in the builders.
enum UDictTrieBuildOption {
UDICTTRIE_BUILD_FAST,
UDICTTRIE_BUILD_SMALL
};
U_NAMESPACE_BEGIN
// Base class for dictionary-trie builder classes.
// Provides the hash set of unique nodes and the Node class hierarchy that
// subclasses use for building a compacted trie.
class U_TOOLUTIL_API DictTrieBuilder : public UMemory {
public:
/**
 * Hash function for the nodes hash table.
 * NOTE(review): returns UBool although Node::hashCode() returns int32_t;
 * this appears to narrow the hash value. The return type should probably be
 * int32_t (to be changed together with the definition).
 * @internal
 */
static UBool hashNode(const void *node);
/**
 * Key comparison function for the nodes hash table.
 * @internal
 */
static UBool equalNodes(const void *left, const void *right);
protected:
DictTrieBuilder();
// Protected and non-virtual: builders are not meant to be deleted via a
// DictTrieBuilder pointer from outside the class hierarchy.
~DictTrieBuilder();
class Node;
// Creates the nodes hash table; sizeGuess is passed on to uhash_openSize().
void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
// Closes the nodes hash table and sets the pointer to NULL.
void deleteCompactBuilder();
/**
* Makes sure that there is only one unique node registered that is
* equivalent to newNode.
* @param newNode Input node. The builder takes ownership.
* @param errorCode ICU in/out UErrorCode.
* Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
* @return newNode if it is the first of its kind, or
* an equivalent node if newNode is a duplicate.
*/
Node *registerNode(Node *newNode, UErrorCode &errorCode);
/**
* Makes sure that there is only one unique FinalValueNode registered
* with this value.
* Avoids creating a node if the value is a duplicate.
* @param value A final value.
* @param errorCode ICU in/out UErrorCode.
* Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
* @return A FinalValueNode with the given value.
*/
Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
/*
* C++ note:
* registerNode() and registerFinalValue() take ownership of their input nodes,
* and only return owned nodes.
* If they see a failure UErrorCode, they will delete the input node.
* If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
* If there is a failure, they return NULL.
*
* NULL Node pointers can be safely passed into other Nodes because
* they call the static Node::hashCode() which checks for a NULL pointer first.
*
* Therefore, as long as builder functions register a new node,
* they need to check for failures only before explicitly dereferencing
* a Node pointer, or before setting a new UErrorCode.
*/
// Creates the builder-specific FinalValueNode subclass instance;
// called by registerFinalValue() when no node for the value is registered yet.
virtual Node *createFinalValueNode(int32_t value) const = 0;
// Hash set of nodes, maps from nodes to integer 1.
UHashtable *nodes;
// Base class of all trie nodes: carries the hash code and the offset field,
// which holds an edge number while marking and is set by write() afterwards.
class Node : public UObject {
public:
Node(int32_t initialHash) : hash(initialHash), offset(0) {}
inline int32_t hashCode() const { return hash; }
// Handles node==NULL.
static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
// Base class operator==() compares the actual class types.
virtual UBool operator==(const Node &other) const;
inline UBool operator!=(const Node &other) const { return !operator==(other); }
/**
* Traverses the Node graph and numbers branch edges, with rightmost edges first.
* This is to avoid writing a duplicate node twice.
*
* Branch nodes in this trie data structure are not symmetric.
* Most branch edges "jump" to other nodes but the rightmost branch edges
* just continue without a jump.
* Therefore, write() must write the rightmost branch edge last
* (trie units are written backwards), and must write it at that point even if
* it is a duplicate of a node previously written elsewhere.
*
* This function visits and marks right branch edges first.
* Edges are numbered with increasingly negative values because we share the
* offset field which gets positive values when nodes are written.
* A branch edge also remembers the first number for any of its edges.
*
* When a further-left branch edge has a number in the range of the rightmost
* edge's numbers, then it will be written as part of the required right edge
* and we can avoid writing it first.
*
* After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
* edge numbers.
*
* @param edgeNumber The first edge number for this node and its sub-nodes.
* @return An edge number that is at least the maximum-negative
* of the input edge number and the numbers of this node and all of its sub-nodes.
*/
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
// write() must set the offset to a positive value.
virtual void write(DictTrieBuilder &builder) = 0;
// See markRightEdgesFirst.
inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
DictTrieBuilder &builder) {
// Note: Edge numbers are negative, lastRight<=firstRight.
// If offset>0 then this node and its sub-nodes have been written already
// and we need not write them again.
// If this node is part of the unwritten right branch edge,
// then we wait until that is written.
if(offset<0 && (offset<lastRight || firstRight<offset)) {
write(builder);
}
}
inline int32_t getOffset() const { return offset; }
protected:
int32_t hash;
int32_t offset;
private:
// No ICU "poor man's RTTI" for this class nor its subclasses.
virtual UClassID getDynamicClassID() const;
};
// Node for a final value. Also usable directly as a stack-allocated lookup key
// (see registerFinalValue()), which is why write() has a dummy default.
class FinalValueNode : public Node {
public:
FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
virtual UBool operator==(const Node &other) const;
// Dummy default implementation, must be overridden for real writing.
virtual void write(DictTrieBuilder & /*builder*/) {}
protected:
int32_t value;
};
// Base class for nodes that can optionally carry a value.
class ValueNode : public Node {
public:
ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
virtual UBool operator==(const Node &other) const;
// Records the value and mixes it into the hash code.
void setValue(int32_t v) {
hasValue=TRUE;
value=v;
hash=hash*37+v;
}
protected:
UBool hasValue;
int32_t value;
};
// Node that matches a sequence of len units and then continues with nextNode.
// The units themselves are stored by the builder-specific subclass.
class LinearMatchNode : public ValueNode {
public:
LinearMatchNode(int32_t len, Node *nextNode)
: ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
length(len), next(nextNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
protected:
int32_t length;
Node *next;
};
// Common base class of the branch nodes.
class BranchNode : public Node {
public:
BranchNode(int32_t initialHash) : Node(initialHash) {}
protected:
// Set by markRightEdgesFirst(); not initialized by the constructor.
int32_t firstEdgeNumber;
};
// Branch node with a short list of (unit, final value or sub-node) entries.
class ListBranchNode : public BranchNode {
public:
ListBranchNode() : BranchNode(0x444444), length(0) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
// Adds a unit with a final value.
// There is no bounds check: the caller must not add more entries than the arrays hold.
void add(int32_t c, int32_t value) {
units[length]=(UChar)c;
equal[length]=NULL;
values[length]=value;
++length;
hash=(hash*37+c)*37+value;
}
// Adds a unit which leads to another match node.
// There is no bounds check: the caller must not add more entries than the arrays hold.
void add(int32_t c, Node *node) {
units[length]=(UChar)c;
equal[length]=node;
values[length]=0;
++length;
hash=(hash*37+c)*37+hashCode(node);
}
protected:
// TODO: 10 -> max(BT/UCT max list lengths)
Node *equal[10]; // NULL means "has final value".
int32_t length;
int32_t values[10];
UChar units[10];
};
// Branch node that splits on a middle unit into a less-than and a
// greater-or-equal sub-node.
class SplitBranchNode : public BranchNode {
public:
SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
: BranchNode(((0x555555*37+middleUnit)*37+
hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
protected:
UChar unit;
Node *lessThan;
Node *greaterOrEqual;
};
// Branch head node, for writing the actual node lead unit.
class BranchHeadNode : public ValueNode {
public:
BranchHeadNode(int32_t len, Node *subNode)
: ValueNode((0x666666*37+len)*37+hashCode(subNode)),
length(len), next(subNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
protected:
int32_t length;
Node *next; // A branch sub-node.
};
};
U_NAMESPACE_END
#endif // __DICTTRIEBUILDER_H__

View file

@ -68,9 +68,94 @@
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
/* Unicode property (value) aliases data swapping --------------------------- */
// Swaps pnames.icu data (dataFormat "pnam", formatVersion 2).
// Only the indexes[] and valueMaps[] (32-bit units up to the ByteTries offset)
// are byte-swapped; everything after that is copied unchanged.
// If length is negative, nothing is written and only the size is computed.
// Returns the header size plus the total data size from the indexes, or 0 on error.
static int32_t
upname_swap(const UDataSwapper *ds,
            const void *inData, int32_t length, void *outData,
            UErrorCode *pErrorCode) {
    /* udata_swapDataHeader checks the arguments */
    const int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode);
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* check data format and format version */
    const UDataInfo *pInfo=
        reinterpret_cast<const UDataInfo *>(
            reinterpret_cast<const char *>(inData)+4);
    if(pInfo->dataFormat[0]!=0x70 ||   /* dataFormat="pnam" */
            pInfo->dataFormat[1]!=0x6e ||
            pInfo->dataFormat[2]!=0x61 ||
            pInfo->dataFormat[3]!=0x6d ||
            pInfo->formatVersion[0]!=2) {
        udata_printError(ds, "upname_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as pnames.icu\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0]);
        *pErrorCode=U_UNSUPPORTED_ERROR;
        return 0;
    }

    const uint8_t *inBytes=reinterpret_cast<const uint8_t *>(inData)+headerSize;
    int32_t bodyLength=length;  // Number of bytes after the header, if length>=0.
    if(length>=0) {
        bodyLength=length-headerSize;
        // formatVersion 2 initially has indexes[8], 32 bytes.
        if(bodyLength<32) {
            udata_printError(ds, "upname_swap(): too few bytes (%d after header) for pnames.icu\n",
                             (int)bodyLength);
            *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
            return 0;
        }
    }

    const int32_t *inIndexes=reinterpret_cast<const int32_t *>(inBytes);
    const int32_t totalSize=udata_readInt32(ds, inIndexes[PropNameData::IX_TOTAL_SIZE]);
    if(length<0) {
        // No data length given: only report the size.
        return headerSize+totalSize;
    }

    if(bodyLength<totalSize) {
        udata_printError(ds, "upname_swap(): too few bytes (%d after header, should be %d) "
                         "for pnames.icu\n",
                         (int)bodyLength, (int)totalSize);
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    uint8_t *outBytes=reinterpret_cast<uint8_t *>(outData)+headerSize;
    const int32_t numBytesIndexesAndValueMaps=
        udata_readInt32(ds, inIndexes[PropNameData::IX_BYTE_TRIES_OFFSET]);

    // Swap the indexes[] and the valueMaps[].
    ds->swapArray32(ds, inBytes, numBytesIndexesAndValueMaps, outBytes, pErrorCode);

    // Copy the rest of the data, unless swapping in place.
    if(inBytes!=outBytes) {
        uprv_memcpy(outBytes+numBytesIndexesAndValueMaps,
                    inBytes+numBytesIndexesAndValueMaps,
                    totalSize-numBytesIndexesAndValueMaps);
    }

    // We need not swap anything else:
    //
    // The ByteTries are already byte-serialized, and are fixed on ASCII.
    // (On an EBCDIC machine, the input string is converted to lowercase ASCII
    // while matching.)
    //
    // The name groups are mostly invariant characters, but since we only
    // generate, and keep in subversion, ASCII versions of pnames.icu,
    // and since only ICU4J uses the pnames.icu data file
    // (the data is hardcoded in ICU4C) and ICU4J uses ASCII data files,
    // we just copy those bytes too.

    return headerSize+totalSize;
}
/* Unicode properties data swapping ----------------------------------------- */
U_CAPI int32_t U_EXPORT2
static int32_t
uprops_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode) {
@ -218,7 +303,7 @@ uprops_swap(const UDataSwapper *ds,
/* Unicode case mapping data swapping --------------------------------------- */
U_CAPI int32_t U_EXPORT2
static int32_t
ucase_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode) {
@ -320,7 +405,7 @@ ucase_swap(const UDataSwapper *ds,
/* Unicode bidi/shaping data swapping --------------------------------------- */
U_CAPI int32_t U_EXPORT2
static int32_t
ubidi_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode) {
@ -428,7 +513,7 @@ ubidi_swap(const UDataSwapper *ds,
#if !UCONFIG_NO_NORMALIZATION
U_CAPI int32_t U_EXPORT2
static int32_t
unorm_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode) {
@ -552,7 +637,7 @@ unorm_swap(const UDataSwapper *ds,
#endif
/* Swap 'Test' data from gentest */
U_CAPI int32_t U_EXPORT2
static int32_t
test_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode) {

View file

@ -246,6 +246,10 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="bytetriebuilder.cpp" />
<ClCompile Include="bytetrieiterator.cpp" />
<ClCompile Include="denseranges.cpp" />
<ClCompile Include="dicttriebuilder.cpp" />
<ClCompile Include="filestrm.c" />
<ClCompile Include="filetools.cpp" />
<ClCompile Include="flagparser.c" />
@ -272,6 +276,9 @@
<DisableLanguageExtensions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</DisableLanguageExtensions>
</ClCompile>
<ClCompile Include="ucbuf.c" />
<ClCompile Include="uchartrie.cpp" />
<ClCompile Include="uchartriebuilder.cpp" />
<ClCompile Include="uchartrieiterator.cpp" />
<ClCompile Include="ucm.c" />
<ClCompile Include="ucmstate.c" />
<ClCompile Include="unewdata.c" />
@ -289,6 +296,10 @@
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="bytetriebuilder.h" />
<ClInclude Include="bytetrieiterator.h" />
<ClInclude Include="denseranges.h" />
<ClInclude Include="dicttriebuilder.h" />
<ClInclude Include="filestrm.h" />
<ClInclude Include="filetools.h" />
<ClInclude Include="flagparser.h" />
@ -301,6 +312,9 @@
<ClInclude Include="swapimpl.h" />
<ClInclude Include="toolutil.h" />
<ClInclude Include="ucbuf.h" />
<ClInclude Include="uchartrie.h" />
<ClInclude Include="uchartriebuilder.h" />
<ClInclude Include="uchartrieiterator.h" />
<ClInclude Include="ucm.h" />
<ClInclude Include="unewdata.h" />
<ClInclude Include="uoptions.h" />
@ -323,4 +337,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -0,0 +1,414 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartrie.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov14
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "uassert.h"
#include "uchartrie.h"
U_NAMESPACE_BEGIN
// Default implementation: appends c as one 16-bit unit if it fits,
// otherwise as its U16_LEAD unit followed by its U16_TRAIL unit.
Appendable &
Appendable::appendCodePoint(UChar32 c) {
    if(c>0xffff) {
        // Two units, appended through the single-unit append().
        return append(U16_LEAD(c)).append(U16_TRAIL(c));
    }
    return append((UChar)c);
}
// Default implementation: appends the string one unit at a time via append(UChar).
// A negative length means that s is NUL-terminated; a NULL s or a zero length
// appends nothing.
Appendable &
Appendable::append(const UChar *s, int32_t length) {
    if(s==NULL || length==0) {
        return *this;
    }
    if(length>0) {
        // Explicit length: append exactly length units.
        for(int32_t i=0; i<length; ++i) {
            append(s[i]);
        }
    } else {
        // NUL-terminated: append up to, but not including, the terminator.
        for(; *s!=0; ++s) {
            append(*s);
        }
    }
    return *this;
}
// Class Appendable declares a private getDynamicClassID() and opts out of ICU's
// "poor man's RTTI"; this macro presumably supplies the matching out-of-line
// definition -- confirm against the macro's definition in unicode/uobject.h.
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(Appendable)
// Reports the match/value result for the current state without consuming input.
UDictTrieResult
UCharTrie::current() const {
    const UChar *p=pos_;
    if(p==NULL) {
        // The trie was stopped by an earlier mismatch.
        return UDICTTRIE_NO_MATCH;
    }
    if(remainingMatchLength_>=0) {
        // Still inside a linear-match node: no value can be read here.
        return UDICTTRIE_NO_VALUE;
    }
    int32_t lead=*p;
    if(lead<kMinValueLead) {
        return UDICTTRIE_NO_VALUE;
    }
    return valueResult(lead);
}
// Handles a branch node for one input unit.
// pos: points just past the branch node's lead unit.
// length: the value the callers derive from the lead unit; 0 means that the
//         real value is stored in the next unit. One is added to get the
//         number of units to select from.
// uchar: the input unit to look up.
// On a match, pos_ is updated and the result for the new position is returned;
// otherwise the trie is stopped and UDICTTRIE_NO_MATCH is returned.
UDictTrieResult
UCharTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) {
// Branch according to the current unit.
if(length==0) {
length=*pos++;
}
++length;
// The length of the branch is the number of units to select from.
// The data structure encodes a binary search.
while(length>kMaxBranchLinearSubNodeLength) {
if(uchar<*pos++) {
// Less-than half: follow the jump delta.
length>>=1;
pos=jumpByDelta(pos);
} else {
// Greater-or-equal half: it starts right after the delta.
length=length-(length>>1);
pos=skipDelta(pos);
}
}
// Drop down to linear search for the last few units.
// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
// and divides length by 2.
do {
if(uchar==*pos++) {
UDictTrieResult result;
int32_t node=*pos;
if(node&kValueIsFinal) {
// Leave the final value for getValue() to read.
result=UDICTTRIE_HAS_FINAL_VALUE;
} else {
// Use the non-final value as the jump delta.
++pos;
// Inlined equivalent of
//   int32_t delta=readValue(pos, node); pos=skipValue(pos, node);
// (bit 15 of node is known to be clear here).
int32_t delta;
if(node<kMinTwoUnitValueLead) {
delta=node;
} else if(node<kThreeUnitValueLead) {
delta=((node-kMinTwoUnitValueLead)<<16)|*pos++;
} else {
delta=(pos[0]<<16)|pos[1];
pos+=2;
}
// end readValue()
pos+=delta;
// Report whether the node at the jump target starts with a value.
node=*pos;
result= node>=kMinValueLead ? valueResult(node) : UDICTTRIE_NO_VALUE;
}
pos_=pos;
return result;
}
--length;
pos=skipValue(pos);
} while(length>1);
// The last comparison unit has no value of its own:
// a match continues with the node that follows it.
if(uchar==*pos++) {
pos_=pos;
int32_t node=*pos;
return node>=kMinValueLead ? valueResult(node) : UDICTTRIE_NO_VALUE;
} else {
stop();
return UDICTTRIE_NO_MATCH;
}
}
// Matches one input unit starting at a node lead unit.
// pos: points at the lead unit of the node to start from.
// uchar: the input unit.
// The class declaration notes that this must only be called while not inside a
// pending linear-match node.
// On a mismatch the trie is stopped and UDICTTRIE_NO_MATCH is returned.
UDictTrieResult
UCharTrie::nextImpl(const UChar *pos, int32_t uchar) {
int32_t node=*pos++;
for(;;) {
if(node<kMinLinearMatch) {
// Branch node: branchNext() updates pos_ or stops the trie itself.
return branchNext(pos, node, uchar);
} else if(node<kMinValueLead) {
// Match the first of length+1 units.
int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
if(uchar==*pos++) {
remainingMatchLength_=--length;
pos_=pos;
// A value can only follow once the whole linear match has been consumed.
return (length<0 && (node=*pos)>=kMinValueLead) ?
valueResult(node) : UDICTTRIE_NO_VALUE;
} else {
// No match.
break;
}
} else if(node&kValueIsFinal) {
// No further matching units.
break;
} else {
// Skip intermediate value.
pos=skipNodeValue(pos, node);
// Keep only the node-type bits and loop to handle the match part of this node.
node&=kNodeTypeMask;
}
}
stop();
return UDICTTRIE_NO_MATCH;
}
// Advances the trie by one input unit from the current state.
UDictTrieResult
UCharTrie::next(int32_t uchar) {
    const UChar *p=pos_;
    if(p==NULL) {
        return UDICTTRIE_NO_MATCH;
    }
    int32_t remaining=remainingMatchLength_;  // Actual remaining match length minus 1.
    if(remaining<0) {
        // Not inside a linear-match node: start at the next node lead unit.
        return nextImpl(p, uchar);
    }
    // Remaining part of a linear-match node.
    if(uchar!=*p) {
        stop();
        return UDICTTRIE_NO_MATCH;
    }
    ++p;
    --remaining;
    remainingMatchLength_=remaining;
    pos_=p;
    if(remaining>=0) {
        // More units of this linear match are still pending.
        return UDICTTRIE_NO_VALUE;
    }
    int32_t lead=*p;
    return lead>=kMinValueLead ? valueResult(lead) : UDICTTRIE_NO_VALUE;
}
// Advances the trie by a whole string from the current state.
// s: the input units; sLength: their count, or negative if s is NUL-terminated.
// Empty input returns current(). A mismatch stops the trie and returns
// UDICTTRIE_NO_MATCH; otherwise the result for the last input unit is returned.
// pos and length are kept in locals and written back to pos_ and
// remainingMatchLength_ only when the input is exhausted (or before a node is
// processed), so stop() paths may leave remainingMatchLength_ unchanged.
UDictTrieResult
UCharTrie::next(const UChar *s, int32_t sLength) {
if(sLength<0 ? *s==0 : sLength==0) {
// Empty input.
return current();
}
const UChar *pos=pos_;
if(pos==NULL) {
return UDICTTRIE_NO_MATCH;
}
int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
for(;;) {
// Fetch the next input unit, if there is one.
// Continue a linear-match node without rechecking sLength<0.
int32_t uchar;
if(sLength<0) {
// NUL-terminated input.
for(;;) {
if((uchar=*s++)==0) {
// End of input: publish the local state and report the result.
remainingMatchLength_=length;
pos_=pos;
int32_t node;
return (length<0 && (node=*pos)>=kMinValueLead) ?
valueResult(node) : UDICTTRIE_NO_VALUE;
}
if(length<0) {
// The linear match is finished; uchar goes to the next node below.
remainingMatchLength_=length;
break;
}
if(uchar!=*pos) {
stop();
return UDICTTRIE_NO_MATCH;
}
++pos;
--length;
}
} else {
// Input with an explicit length; same logic as above.
for(;;) {
if(sLength==0) {
remainingMatchLength_=length;
pos_=pos;
int32_t node;
return (length<0 && (node=*pos)>=kMinValueLead) ?
valueResult(node) : UDICTTRIE_NO_VALUE;
}
uchar=*s++;
--sLength;
if(length<0) {
remainingMatchLength_=length;
break;
}
if(uchar!=*pos) {
stop();
return UDICTTRIE_NO_MATCH;
}
++pos;
--length;
}
}
// uchar is the next input unit and pos is on a node lead unit.
int32_t node=*pos++;
for(;;) {
if(node<kMinLinearMatch) {
UDictTrieResult result=branchNext(pos, node, uchar);
if(result==UDICTTRIE_NO_MATCH) {
return UDICTTRIE_NO_MATCH;
}
// Fetch the next input unit, if there is one.
if(sLength<0) {
if((uchar=*s++)==0) {
return result;
}
} else {
if(sLength==0) {
return result;
}
uchar=*s++;
--sLength;
}
// There is more input, so a final value means that it cannot match.
if(result==UDICTTRIE_HAS_FINAL_VALUE) {
// No further matching units.
stop();
return UDICTTRIE_NO_MATCH;
}
pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
node=*pos++;
} else if(node<kMinValueLead) {
// Match length+1 units.
length=node-kMinLinearMatch; // Actual match length minus 1.
if(uchar!=*pos) {
stop();
return UDICTTRIE_NO_MATCH;
}
++pos;
--length;
// Back to the outer loop, which matches the rest of this linear-match node.
break;
} else if(node&kValueIsFinal) {
// No further matching units.
stop();
return UDICTTRIE_NO_MATCH;
} else {
// Skip intermediate value.
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
}
}
}
}
// Helper for findUniqueValue(): checks all values reachable from a branch.
// pos: points at the first comparison unit of the branch (sub-)node.
// length: number of units to select from in this (sub-)node.
// haveUniqueValue/uniqueValue: whether a value has been seen so far, and that value.
// Returns NULL as soon as two different values are found; otherwise the position
// just past the last comparison unit, where the next node starts.
const UChar *
UCharTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length,
UBool haveUniqueValue, int32_t &uniqueValue) {
while(length>kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison unit
// Recurse into the less-than half, then continue with the other half here.
if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
return NULL;
}
length=length-(length>>1);
pos=skipDelta(pos);
}
do {
++pos; // ignore a comparison unit
// handle its value
int32_t node=*pos++;
UBool isFinal=(UBool)(node>>15);
node&=0x7fff;
int32_t value=readValue(pos, node);
pos=skipValue(pos, node);
if(isFinal) {
if(haveUniqueValue) {
if(value!=uniqueValue) {
return NULL;
}
} else {
uniqueValue=value;
haveUniqueValue=TRUE;
}
} else {
// A non-final value is a jump delta relative to the position after the value.
if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
return NULL;
}
haveUniqueValue=TRUE;
}
} while(--length>1);
return pos+1; // ignore the last comparison unit
}
// Helper for hasUniqueValue(): walks the nodes starting at pos and checks
// whether every value found equals one single value.
// pos: points at a node lead unit.
// haveUniqueValue/uniqueValue: whether a value has been seen so far, and that
// value; uniqueValue is set to the first value found.
// Returns FALSE as soon as two different values are found, and TRUE when a
// final value ends the walk without a conflict.
UBool
UCharTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
int32_t node=*pos++;
for(;;) {
if(node<kMinLinearMatch) {
// Branch node; a lead unit of 0 means that the length is in the next unit.
if(node==0) {
node=*pos++;
}
pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
if(pos==NULL) {
return FALSE;
}
haveUniqueValue=TRUE;
node=*pos++;
} else if(node<kMinValueLead) {
// linear-match node
pos+=node-kMinLinearMatch+1; // Ignore the match units.
node=*pos++;
} else {
// Value node: final values and intermediate values use different encodings.
UBool isFinal=(UBool)(node>>15);
int32_t value;
if(isFinal) {
value=readValue(pos, node&0x7fff);
} else {
value=readNodeValue(pos, node);
}
if(haveUniqueValue) {
if(value!=uniqueValue) {
return FALSE;
}
} else {
uniqueValue=value;
haveUniqueValue=TRUE;
}
if(isFinal) {
return TRUE;
}
// Intermediate value: continue with the match part of the same node.
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
}
}
}
// Appends to out each unit that can continue the string from the current state,
// and returns how many there are.
int32_t
UCharTrie::getNextUChars(Appendable &out) const {
    const UChar *p=pos_;
    if(p==NULL) {
        return 0;
    }
    if(remainingMatchLength_>=0) {
        // Next unit of a pending linear-match node.
        out.append(*p);
        return 1;
    }
    int32_t lead=*p++;
    if(lead>=kMinValueLead) {
        if(lead&kValueIsFinal) {
            // Nothing follows a final value.
            return 0;
        }
        // Step over the intermediate value and keep only the node-type bits.
        p=skipNodeValue(p, lead);
        lead&=kNodeTypeMask;
    }
    if(lead>=kMinLinearMatch) {
        // First unit of the linear-match node.
        out.append(*p);
        return 1;
    }
    // Branch node; a lead unit of 0 means that the length is in the next unit.
    if(lead==0) {
        lead=*p++;
    }
    int32_t count=lead+1;
    getNextBranchUChars(p, count, out);
    return count;
}
// Appends every comparison unit of the branch (sub-)node at pos to out.
// length is the number of units to select from in this (sub-)node.
void
UCharTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) {
    // Binary-search levels: recurse into the less-than half,
    // then continue here with the greater-or-equal half.
    for(; length>kMaxBranchLinearSubNodeLength; length-=length>>1) {
        ++pos;  // ignore the comparison unit
        getNextBranchUChars(jumpByDelta(pos), length>>1, out);
        pos=skipDelta(pos);
    }
    // Linear list: (unit, value) pairs followed by one last unit without a value.
    do {
        out.append(*pos);
        pos=skipValue(pos+1);
        --length;
    } while(length>1);
    out.append(*pos);
}
U_NAMESPACE_END

View file

@ -0,0 +1,433 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartrie.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov14
* created by: Markus W. Scherer
*/
#ifndef __UCHARTRIE_H__
#define __UCHARTRIE_H__
/**
* \file
* \brief C++ API: Dictionary trie for mapping Unicode strings (or 16-bit-unit sequences)
* to integer values.
*/
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "uassert.h"
#include "udicttrie.h"
U_NAMESPACE_BEGIN
class UCharTrieBuilder;
class UCharTrieIterator;
/**
* Base class for objects to which Unicode characters and strings can be appended.
* Combines elements of Java Appendable and ICU4C ByteSink.
* Subclasses must implement append(UChar); the other two methods are virtual
* and have default implementations.
* TODO: Should live in separate files, could be public API.
*/
class U_TOOLUTIL_API Appendable : public UObject {
public:
/**
* Appends a 16-bit code unit.
* @param c code unit
* @return *this
*/
virtual Appendable &append(UChar c) = 0;
/**
* Appends a code point; has a default implementation.
* @param c code point
* @return *this
*/
virtual Appendable &appendCodePoint(UChar32 c);
/**
* Appends a string; has a default implementation.
* @param s string
* @param length string length, or -1 if NUL-terminated
* @return *this
*/
virtual Appendable &append(const UChar *s, int32_t length);
// TODO: getAppendBuffer(), see ByteSink
// TODO: flush() (?) see ByteSink
private:
// No ICU "poor man's RTTI" for this class nor its subclasses.
// Declared private so that it is not part of the usable interface.
virtual UClassID getDynamicClassID() const;
};
/**
* Light-weight, non-const reader class for a UCharTrie.
* Traverses a UChar-serialized data structure with minimal state,
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
*/
class U_TOOLUTIL_API UCharTrie : public UMemory {
public:
/**
* Constructs a reader positioned at the start (root node) of the trie data.
* The pointer is stored as is; the data is not copied.
* @param trieUChars the UChar-serialized trie
*/
UCharTrie(const UChar *trieUChars)
: uchars_(trieUChars),
pos_(uchars_), remainingMatchLength_(-1) {}
/**
* Resets this trie to its initial state.
*/
UCharTrie &reset() {
pos_=uchars_;
remainingMatchLength_=-1;
return *this;
}
/**
* UCharTrie state object, for saving a trie's current state
* and resetting the trie back to this state later.
*/
class State : public UMemory {
public:
// A NULL uchars marks a State that does not hold a saved state yet;
// pos and remainingMatchLength stay uninitialized until saveState().
State() { uchars=NULL; }
private:
friend class UCharTrie;
const UChar *uchars;
const UChar *pos;
int32_t remainingMatchLength;
};
/**
* Saves the state of this trie.
* @see resetToState
*/
const UCharTrie &saveState(State &state) const {
state.uchars=uchars_;
state.pos=pos_;
state.remainingMatchLength=remainingMatchLength_;
return *this;
}
/**
* Resets this trie to the saved state.
* If the state object contains no state, or the state of a different trie,
* then this trie remains unchanged.
* @see saveState
* @see reset
*/
UCharTrie &resetToState(const State &state) {
if(uchars_==state.uchars && uchars_!=NULL) {
pos_=state.pos;
remainingMatchLength_=state.remainingMatchLength;
}
return *this;
}
/**
* Determines whether the string so far matches, whether it has a value,
* and whether another input UChar can continue a matching string.
* @return The match/value Result.
*/
UDictTrieResult current() const;
/**
* Traverses the trie from the initial state for this input UChar.
* Equivalent to reset().next(uchar).
* @return The match/value Result.
*/
inline UDictTrieResult first(int32_t uchar) {
remainingMatchLength_=-1;
return nextImpl(uchars_, uchar);
}
/**
* Traverses the trie from the initial state for the
* one or two UTF-16 code units for this input code point.
* Equivalent to reset().nextForCodePoint(cp).
* @return The match/value Result.
*/
inline UDictTrieResult firstForCodePoint(UChar32 cp) {
return cp<=0xffff ?
first(cp) :
(first(U16_LEAD(cp))!=UDICTTRIE_NO_MATCH ?
next(U16_TRAIL(cp)) :
UDICTTRIE_NO_MATCH);
}
/**
* Traverses the trie from the current state for this input UChar.
* @return The match/value Result.
*/
UDictTrieResult next(int32_t uchar);
/**
* Traverses the trie from the current state for the
* one or two UTF-16 code units for this input code point.
* @return The match/value Result.
*/
inline UDictTrieResult nextForCodePoint(UChar32 cp) {
return cp<=0xffff ?
next(cp) :
(next(U16_LEAD(cp))!=UDICTTRIE_NO_MATCH ?
next(U16_TRAIL(cp)) :
UDICTTRIE_NO_MATCH);
}
/**
* Traverses the trie from the current state for this string.
* Equivalent to
* \code
* Result result=current();
* for(each c in s)
* if((result=next(c))==UDICTTRIE_NO_MATCH) return UDICTTRIE_NO_MATCH;
* return result;
* \endcode
* @param s the input string
* @param length the string length, or negative if s is NUL-terminated
* @return The match/value Result.
*/
UDictTrieResult next(const UChar *s, int32_t length);
/**
* Returns a matching string's value if called immediately after
* current()/first()/next() returned UDICTTRIE_HAS_VALUE or UDICTTRIE_HAS_FINAL_VALUE.
* getValue() can be called multiple times.
*
* Do not call getValue() after UDICTTRIE_NO_MATCH or UDICTTRIE_NO_VALUE!
*/
inline int32_t getValue() const {
const UChar *pos=pos_;
int32_t leadUnit=*pos++;
U_ASSERT(leadUnit>=kMinValueLead);
// Final values and intermediate (node) values use different compact encodings.
return leadUnit&kValueIsFinal ?
readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
}
/**
* Determines whether all strings reachable from the current state
* map to the same value.
* @param uniqueValue Receives the unique value, if this function returns TRUE.
* (output-only)
* @return TRUE if all strings reachable from the current state
* map to the same value.
*/
inline UBool hasUniqueValue(int32_t &uniqueValue) const {
const UChar *pos=pos_;
// Skip the rest of a pending linear-match node.
return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
}
/**
* Finds each UChar which continues the string from the current state.
* That is, each UChar c for which it would be next(c)!=UDICTTRIE_NO_MATCH now.
* @param out Each next UChar is appended to this object.
* (Only uses the out.append(c) method.)
* @return the number of UChars which continue the string from here
*/
int32_t getNextUChars(Appendable &out) const;
private:
friend class UCharTrieBuilder;
friend class UCharTrieIterator;
// Marks the trie as stopped: a NULL pos_ means that there are no more matches.
inline void stop() {
pos_=NULL;
}
// Reads a compact 32-bit integer.
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
static inline int32_t readValue(const UChar *pos, int32_t leadUnit) {
int32_t value;
if(leadUnit<kMinTwoUnitValueLead) {
// One unit: the lead unit is the value.
value=leadUnit;
} else if(leadUnit<kThreeUnitValueLead) {
// Two units: high bits from the lead unit, low 16 bits from the next unit.
value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
} else {
// Three units: the two units after the lead unit hold all 32 bits.
value=(pos[0]<<16)|pos[1];
}
return value;
}
// Skips the 0, 1 or 2 units that follow the lead unit of a compact value.
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
static inline const UChar *skipValue(const UChar *pos, int32_t leadUnit) {
if(leadUnit>=kMinTwoUnitValueLead) {
if(leadUnit<kThreeUnitValueLead) {
++pos;
} else {
pos+=2;
}
}
return pos;
}
// Skips a whole compact value, starting at its lead unit.
static inline const UChar *skipValue(const UChar *pos) {
int32_t leadUnit=*pos++;
return skipValue(pos, leadUnit&0x7fff);
}
// Reads an intermediate value whose lead unit is shared with a match node.
// pos is already after the leadUnit.
static inline int32_t readNodeValue(const UChar *pos, int32_t leadUnit) {
U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
int32_t value;
if(leadUnit<kMinTwoUnitNodeValueLead) {
value=(leadUnit>>6)-1;
} else if(leadUnit<kThreeUnitNodeValueLead) {
value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
} else {
value=(pos[0]<<16)|pos[1];
}
return value;
}
// Skips the 0, 1 or 2 units that follow the lead unit of an intermediate value.
// pos is already after the leadUnit.
static inline const UChar *skipNodeValue(const UChar *pos, int32_t leadUnit) {
U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
if(leadUnit>=kMinTwoUnitNodeValueLead) {
if(leadUnit<kThreeUnitNodeValueLead) {
++pos;
} else {
pos+=2;
}
}
return pos;
}
// Reads the compact delta at pos and returns the position that many units
// past the end of the delta.
static inline const UChar *jumpByDelta(const UChar *pos) {
int32_t delta=*pos++;
if(delta>=kMinTwoUnitDeltaLead) {
if(delta==kThreeUnitDeltaLead) {
delta=(pos[0]<<16)|pos[1];
pos+=2;
} else {
delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
}
}
return pos+delta;
}
// Returns the position just past the compact delta at pos, without jumping.
static const UChar *skipDelta(const UChar *pos) {
int32_t delta=*pos++;
if(delta>=kMinTwoUnitDeltaLead) {
if(delta==kThreeUnitDeltaLead) {
pos+=2;
} else {
++pos;
}
}
return pos;
}
// Maps a value lead unit to a result: bit 15 (final) is subtracted from
// UDICTTRIE_HAS_VALUE, so a final value yields the enum constant just below it.
static inline UDictTrieResult valueResult(int32_t node) {
return (UDictTrieResult)(UDICTTRIE_HAS_VALUE-(node>>15));
}
// Handles a branch node for both next(uchar) and next(string).
UDictTrieResult branchNext(const UChar *pos, int32_t length, int32_t uchar);
// Requires remainingMatchLength_<0.
UDictTrieResult nextImpl(const UChar *pos, int32_t uchar);
// Helper functions for hasUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
static const UChar *findUniqueValueFromBranch(const UChar *pos, int32_t length,
UBool haveUniqueValue, int32_t &uniqueValue);
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead unit.
static UBool findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue);
// Helper functions for getNextUChars().
// getNextUChars() when pos is on a branch node.
static void getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out);
// UCharTrie data structure
//
// The trie consists of a series of UChar-serialized nodes for incremental
// Unicode string/UChar sequence matching. (UChar=16-bit unsigned integer)
// The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead unit ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further units.
//
// Node types:
// - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/UChar sequence so far.
// - Match node, optionally with an intermediate value in a different compact format.
// The value, if present, is for the string/UChar sequence so far.
//
// Aside from the value, which uses the node lead unit's high bits:
//
// - Linear-match node: Matches a number of units.
// - Branch node: Branches to other nodes according to the current input unit.
// The node unit is the length of the branch (number of units to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison unit.
// If one of the key units matches, then the value is either a final value for
// the string so far, or a "jump" delta to the next node.
// If the last unit matches, then matching continues with the next node.
// (Values have the same encoding as final-value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one unit and one "jump" delta.
// If the input unit is less than the sub-node unit, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Match-node lead unit values, after masking off intermediate-value bits:
// 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next unit.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
static const int32_t kMaxBranchLinearSubNodeLength=5;
// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
static const int32_t kMinLinearMatch=0x30;
static const int32_t kMaxLinearMatchLength=0x10;
// Match-node lead unit bits 14..6 for the optional intermediate value.
// If these bits are 0, then there is no intermediate value.
// Otherwise, see the *NodeValue* constants below.
static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
// A final-value node has bit 15 set.
static const int32_t kValueIsFinal=0x8000;
// Compact value: After testing and masking off bit 15, use the following thresholds.
static const int32_t kMaxOneUnitValue=0x3fff;
static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
static const int32_t kThreeUnitValueLead=0x7fff;
static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
// Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
static const int32_t kMaxOneUnitNodeValue=0xff;
static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
static const int32_t kThreeUnitNodeValueLead=0x7fc0;
static const int32_t kMaxTwoUnitNodeValue=
((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
// Compact delta integers.
static const int32_t kMaxOneUnitDelta=0xfbff;
static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
static const int32_t kThreeUnitDeltaLead=0xffff;
static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
// Fixed value referencing the UCharTrie words.
const UChar *uchars_;
// Iterator variables.
// Pointer to next trie unit to read. NULL if no more matches.
const UChar *pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
int32_t remainingMatchLength_;
};
U_NAMESPACE_END
#endif // __UCHARTRIE_H__

View file

@ -0,0 +1,696 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartriebuilder.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov14
* created by: Markus W. Scherer
*
* Builder class for UCharTrie dictionary trie.
*/
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uarrsort.h"
#include "uchartrie.h"
#include "uchartriebuilder.h"
U_NAMESPACE_BEGIN
/*
* Note: This builder implementation stores (string, value) pairs with full copies
* of the 16-bit-unit sequences, until the UCharTrie is built.
* It might(!) take less memory if we collected the data in a temporary, dynamic trie.
*/
// One (string, value) pair held by the builder.
// The string itself lives in a shared UnicodeString buffer that is passed to
// every accessor; this object only remembers where its string starts there.
class UCharTrieElement : public UMemory {
public:
    // No constructor is declared on purpose: the implicit one initializes nothing.

    // Records val and appends s, preceded by one length unit, to strings.
    void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);

    // The string length is the unit stored at stringOffset.
    int32_t getStringLength(const UnicodeString &strings) const {
        return strings[stringOffset];
    }

    // Temporary substring of strings covering just this element's string.
    UnicodeString getString(const UnicodeString &strings) const {
        return strings.tempSubString(stringOffset+1, getStringLength(strings));
    }

    // Unit at index within this element's string (the length unit is skipped).
    UChar charAt(int32_t index, const UnicodeString &strings) const {
        const int32_t firstUnit=stringOffset+1;
        return strings[firstUnit+index];
    }

    int32_t getValue() const { return value; }

    // Compares this element's string with o's string; both live in strings.
    int32_t compareStringTo(const UCharTrieElement &o, const UnicodeString &strings) const;

private:
    // The first strings unit contains the string length.
    // (Compared with a stringLength field here, this saves 2 bytes per string.)
    int32_t stringOffset;
    int32_t value;
};
void
UCharTrieElement::setTo(const UnicodeString &s, int32_t val,
UnicodeString &strings, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
int32_t length=s.length();
if(length>0xffff) {
// Too long: We store the length in 1 unit.
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
stringOffset=strings.length();
strings.append((UChar)length);
value=val;
strings.append(s);
}
// Returns the UnicodeString::compare() result of this element's string
// against other's string; both strings are read from the shared strings buffer.
int32_t
UCharTrieElement::compareStringTo(const UCharTrieElement &other, const UnicodeString &strings) const {
    const UnicodeString &thisString=getString(strings);
    const UnicodeString &otherString=other.getString(strings);
    return thisString.compare(otherString);
}
// Releases the element array (allocated with new[] in add()) and the
// serialized-trie buffer (allocated with uprv_malloc() in build()).
UCharTrieBuilder::~UCharTrieBuilder() {
delete[] elements;
uprv_free(uchars);
}
// Adds a (string, value) pair to the builder.
// s: the string to map; value: the value to map it to.
// errorCode: in/out ICU error code. Nothing is done if it already indicates a
//   failure. It is set to
//   - U_NO_WRITE_PERMISSION if the trie has already been built,
//   - U_MEMORY_ALLOCATION_ERROR if the element array or the strings buffer
//     cannot grow,
//   - whatever UCharTrieElement::setTo() reports (e.g. for a string that is too long).
// Returns *this.
// The element is only counted once it has been stored successfully, so a failed
// add() leaves the builder with the elements it had before.
UCharTrieBuilder &
UCharTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(ucharsLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    ucharsCapacity+=s.length()+1;  // Crude uchars preallocation estimate.
    if(elementsLength==elementsCapacity) {
        // Grow the element array: start with 1024 entries, then quadruple.
        int32_t newCapacity;
        if(elementsCapacity==0) {
            newCapacity=1024;
        } else {
            newCapacity=4*elementsCapacity;
        }
        UCharTrieElement *newElements=new UCharTrieElement[newCapacity];
        if(newElements==NULL) {
            // Keep the old array intact and stop here; continuing would copy
            // into, and later dereference, a NULL array.
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        if(elementsLength>0) {
            uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharTrieElement));
        }
        delete[] elements;
        elements=newElements;
        elementsCapacity=newCapacity;
    }
    elements[elementsLength].setTo(s, value, strings, errorCode);
    if(U_SUCCESS(errorCode)) {
        // Count the element only now that its fields have been set.
        ++elementsLength;
        if(strings.isBogus()) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
    return *this;
}
U_CDECL_BEGIN
// Comparison callback for uprv_sortArray(): orders two UCharTrieElement objects
// by their strings. context must point to the UnicodeString that holds them.
static int32_t U_CALLCONV
compareElementStrings(const void *context, const void *left, const void *right) {
    const UnicodeString &strings=*static_cast<const UnicodeString *>(context);
    const UCharTrieElement &lhs=*static_cast<const UCharTrieElement *>(left);
    const UCharTrieElement &rhs=*static_cast<const UCharTrieElement *>(right);
    return lhs.compareStringTo(rhs, strings);
}
U_CDECL_END
// Builds the serialized trie from the elements added so far.
// buildOption: UDICTTRIE_BUILD_FAST writes the nodes directly; any other value
//   goes through the compact builder (makeNode() etc.).
// result: set to a read-only alias of the serialized units, which stay owned by
//   this builder.
// errorCode: in/out ICU error code. It is set to
//   - U_INDEX_OUTOFBOUNDS_ERROR if no element has been added,
//   - U_ILLEGAL_ARGUMENT_ERROR if the same string was added twice,
//   - U_MEMORY_ALLOCATION_ERROR if memory could not be allocated.
// Returns result. Calling build() again after a successful build returns the
// same data without rebuilding.
UnicodeString &
UCharTrieBuilder::build(UDictTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return result;
}
if(ucharsLength>0) {
// Already built.
// The trie occupies the last ucharsLength units of the buffer.
result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
return result;
}
if(elementsLength==0) {
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return result;
}
if(strings.isBogus()) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
// Sort the elements by their strings; &strings is the comparator's context.
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharTrieElement),
compareElementStrings, &strings,
FALSE, // need not be a stable sort
&errorCode);
if(U_FAILURE(errorCode)) {
return result;
}
// Duplicate strings are not allowed.
// After sorting, duplicates are adjacent.
UnicodeString prev=elements[0].getString(strings);
for(int32_t i=1; i<elementsLength; ++i) {
UnicodeString current=elements[i].getString(strings);
if(prev==current) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return result;
}
prev.fastCopyFrom(current);
}
// Create and UChar-serialize the trie for the elements.
if(ucharsCapacity<1024) {
ucharsCapacity=1024;
}
// ucharsCapacity counts 16-bit units, hence *2 for the size in bytes.
uchars=reinterpret_cast<UChar *>(uprv_malloc(ucharsCapacity*2));
if(uchars==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
if(buildOption==UDICTTRIE_BUILD_FAST) {
writeNode(0, elementsLength, 0);
} else /* UDICTTRIE_BUILD_SMALL */ {
createCompactBuilder(2*elementsLength, errorCode);
Node *root=makeNode(0, elementsLength, 0, errorCode);
if(U_SUCCESS(errorCode)) {
root->markRightEdgesFirst(-1);
root->write(*this);
}
deleteCompactBuilder();
}
// NOTE(review): uchars is re-checked here, presumably because the write
// functions can set it to NULL when growing the buffer fails -- confirm.
if(uchars==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
}
return result;
}
// Writes the node(s) for the elements [start..limit[ at string index unitIndex.
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
// Within each case, whatever follows a node in the serialized form is written
// before the node's own units, and the lead unit (value and type) is written last.
void
UCharTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
UBool hasValue=FALSE;
int32_t value=0;
int32_t type;
// Because the elements are sorted, only the first one can end exactly at unitIndex.
if(unitIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
writeValueAndFinal(value, TRUE); // final-value node
return;
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than unitIndex.
const UCharTrieElement &minElement=elements[start];
const UCharTrieElement &maxElement=elements[limit-1];
int32_t minUnit=minElement.charAt(unitIndex, strings);
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
// Extend the match for as long as the first and last strings agree.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastUnitIndex=unitIndex;
while(++lastUnitIndex<minStringLength &&
minElement.charAt(lastUnitIndex, strings)==
maxElement.charAt(lastUnitIndex, strings)) {}
// Write what follows the linear match first.
writeNode(start, limit, lastUnitIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
const UChar *s=minElement.getString(strings).getBuffer();
int32_t length=lastUnitIndex-unitIndex;
while(length>UCharTrie::kMaxLinearMatchLength) {
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
length-=UCharTrie::kMaxLinearMatchLength;
write(s+lastUnitIndex, UCharTrie::kMaxLinearMatchLength);
write(UCharTrie::kMinLinearMatch+UCharTrie::kMaxLinearMatchLength-1);
}
// The remaining first chunk gets its lead unit from writeValueAndType() below.
write(s+unitIndex, length);
type=UCharTrie::kMinLinearMatch+length-1;
} else {
// Branch node.
int32_t length=0; // Number of different units at unitIndex.
int32_t i=start;
do {
UChar unit=elements[i++].charAt(unitIndex, strings);
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minUnit!=maxUnit.
writeBranchSubNode(start, limit, unitIndex, length);
// The lead unit stores length-1 if that fits below kMinLinearMatch;
// otherwise length-1 gets its own unit and the lead unit is 0.
if(--length<UCharTrie::kMinLinearMatch) {
type=length;
} else {
write(length);
type=0;
}
}
writeValueAndType(hasValue, value, type);
}
// Serializes the body of a branch node for the [start..limit[ elements, which
// have `length` different units at unitIndex.
// While length exceeds kMaxBranchLinearSubNodeLength the branch is split at
// its middle unit; the remaining units are written as a list of
// (unit, final value or jump delta) pairs.
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
void
UCharTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
// One entry per split level.
// NOTE(review): the capacity of 16 assumes the halving loop below never runs
// more than 16 times for any possible number of distinct units -- confirm
// against kMaxBranchLinearSubNodeLength.
UChar middleUnits[16];
int32_t lessThan[16];
int32_t ltLength=0;
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t count=length/2;
int32_t i=start;
UChar unit;
do {
unit=elements[i++].charAt(unitIndex, strings);
// No i<limit check: only length/2 of the length different units are
// skipped, so elements with further units must still follow.
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
} while(--count>0);
// Encode the less-than branch first.
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
writeBranchSubNode(start, i, unitIndex, length/2);
// Remember where the less-than branch starts, measured from the buffer end.
lessThan[ltLength]=ucharsLength;
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t starts[UCharTrie::kMaxBranchLinearSubNodeLength];
UBool final[UCharTrie::kMaxBranchLinearSubNodeLength-1];
int32_t unitNumber=0;
do {
int32_t i=starts[unitNumber]=start;
UChar unit=elements[i++].charAt(unitIndex, strings);
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
// Final if exactly one string has this unit and the string ends right after it.
final[unitNumber]= start==i-1 && unitIndex+1==elements[start].getStringLength(strings);
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
starts[unitNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t jumpTargets[UCharTrie::kMaxBranchLinearSubNodeLength-1];
do {
--unitNumber;
if(!final[unitNumber]) {
writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
jumpTargets[unitNumber]=ucharsLength;
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
writeNode(start, limit, unitIndex+1);
write(elements[start].charAt(unitIndex, strings));
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
start=starts[unitNumber];
int32_t value;
if(final[unitNumber]) {
// Write the final value for the one string ending with this unit.
value=elements[start].getValue();
} else {
// Write the delta to the start position of the sub-node.
value=ucharsLength-jumpTargets[unitNumber];
}
writeValueAndFinal(value, final[unitNumber]);
write(elements[start].charAt(unitIndex, strings));
}
// Write the split-branch nodes.
// The innermost split is written first, so the outermost one ends up in
// front of it in the serialized trie.
while(ltLength>0) {
--ltLength;
writeDelta(ucharsLength-lessThan[ltLength]); // less-than
write(middleUnits[ltLength]);
}
}
// Creates the compact-builder node for the [start..limit[ elements at string
// index unitIndex and returns the node handed back by registerNode()
// (or by registerFinalValue() for a lone final value).
// Preconditions: start<limit; the [start..limit[ strings are sorted and
// share a common prefix of length unitIndex.
DictTrieBuilder::Node *
UCharTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    // Does the first string end exactly at unitIndex? Then it carries a value.
    UBool hasValue=FALSE;
    int32_t value=0;
    if(unitIndex==elements[start].getStringLength(strings)) {
        value=elements[start].getValue();
        ++start;
        if(start==limit) {
            // Nothing else shares this prefix: plain final value.
            return registerFinalValue(value, errorCode);
        }
        hasValue=TRUE;
    }
    // From here on every [start..limit[ string is longer than unitIndex.
    const UCharTrieElement &first=elements[start];
    const UCharTrieElement &last=elements[limit-1];
    ValueNode *result;
    if(first.charAt(unitIndex, strings)!=last.charAt(unitIndex, strings)) {
        // Branch node: count the different units at unitIndex
        // (at least 2 because the first and last units differ).
        int32_t unitCount=0;
        for(int32_t i=start; i<limit; ++unitCount) {
            UChar unit=elements[i++].charAt(unitIndex, strings);
            while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
                ++i;
            }
        }
        Node *subNode=makeBranchSubNode(start, limit, unitIndex, unitCount, errorCode);
        result=new UCTBranchHeadNode(unitCount, subNode);
    } else {
        // Linear-match node: every string has the same unit at unitIndex.
        // Extend the match for as long as the first and last strings agree.
        const int32_t firstLength=first.getStringLength(strings);
        int32_t matchLimit=unitIndex+1;
        while(matchLimit<firstLength &&
                first.charAt(matchLimit, strings)==last.charAt(matchLimit, strings)) {
            ++matchLimit;
        }
        Node *following=makeNode(start, limit, matchLimit, errorCode);
        // Cut the matched sequence into chunks of at most kMaxLinearMatchLength,
        // building the chain from the last chunk toward the first.
        const UChar *s=first.getString(strings).getBuffer();
        int32_t remaining=matchLimit-unitIndex;
        for(; remaining>UCharTrie::kMaxLinearMatchLength;
                remaining-=UCharTrie::kMaxLinearMatchLength) {
            matchLimit-=UCharTrie::kMaxLinearMatchLength;
            result=new UCTLinearMatchNode(
                s+matchLimit,
                UCharTrie::kMaxLinearMatchLength,
                following);
            result=(ValueNode *)registerNode(result, errorCode);
            following=result;
        }
        result=new UCTLinearMatchNode(s+unitIndex, remaining, following);
    }
    if(result!=NULL && hasValue) {
        result->setValue(value);
    }
    return registerNode(result, errorCode);
}
// Creates the compact-builder nodes for the body of a branch over the
// [start..limit[ elements, which have `length` different units at unitIndex.
// Mirrors writeBranchSubNode(): while length exceeds
// kMaxBranchLinearSubNodeLength the branch is split at its middle unit, and
// the remaining units go into one list-branch node.
// Returns NULL if errorCode indicates a failure on entry, after the split
// loop, or if the list node cannot be allocated; otherwise returns what
// registerNode() hands back.
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
DictTrieBuilder::Node *
UCharTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
// One entry per split level.
// NOTE(review): the capacity of 16 assumes the halving loop below never runs
// more than 16 times -- confirm against kMaxBranchLinearSubNodeLength.
UChar middleUnits[16];
Node *lessThan[16];
int32_t ltLength=0;
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t count=length/2;
int32_t i=start;
UChar unit;
do {
unit=elements[i++].charAt(unitIndex, strings);
// No i<limit check: only length/2 of the length different units are
// skipped, so elements with further units must still follow.
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
} while(--count>0);
// Create the less-than branch.
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
if(U_FAILURE(errorCode)) {
return NULL;
}
UCTListBranchNode *listNode=new UCTListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t unitNumber=0;
do {
int32_t i=start;
UChar unit=elements[i++].charAt(unitIndex, strings);
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
// Exactly one string with this unit, ending right after it: store its
// value directly. Otherwise build the sub-node for the [start..i[ range.
if(start==i-1 && unitIndex+1==elements[start].getStringLength(strings)) {
listNode->add(unit, elements[start].getValue());
} else {
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
}
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
UChar unit=elements[start].charAt(unitIndex, strings);
if(start==limit-1 && unitIndex+1==elements[start].getStringLength(strings)) {
listNode->add(unit, elements[start].getValue());
} else {
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
// The innermost split wraps the list node first, so the outermost split
// ends up as the returned node.
// NOTE(review): the result of new is passed on unchecked; presumably
// registerNode() deals with a NULL node -- confirm in DictTrieBuilder.
while(ltLength>0) {
--ltLength;
node=registerNode(
new UCTSplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
// Serializes this node as a final value and records the offset returned by
// the builder.
void
UCharTrieBuilder::UCTFinalValueNode::write(DictTrieBuilder &builder) {
    UCharTrieBuilder &trieBuilder=static_cast<UCharTrieBuilder &>(builder);
    offset=trieBuilder.writeValueAndFinal(value, TRUE);
}
// Linear-match node over units[0..len[ followed by nextNode.
// Only the pointer to the units is stored, not a copy.
// The hash set up by the base class is extended with a hash of the units.
UCharTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode),
          s(units) {
    hash=37*hash+uhash_hashUCharsN(units, len);
}
// Two linear-match nodes are equal if the base-class comparison succeeds and
// both nodes match the same sequence of units.
UBool
UCharTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
    if(this!=&other) {
        if(!LinearMatchNode::operator==(other)) {
            return FALSE;
        }
        // Cast only after the base-class comparison has succeeded.
        const UCTLinearMatchNode &that=(const UCTLinearMatchNode &)other;
        return u_memcmp(s, that.s, length)==0;
    }
    return TRUE;
}
// Serializes the next node first, then the matched units, and finally the
// lead unit that carries the node type and the optional value.
void
UCharTrieBuilder::UCTLinearMatchNode::write(DictTrieBuilder &builder) {
    next->write(builder);
    UCharTrieBuilder &trieBuilder=static_cast<UCharTrieBuilder &>(builder);
    trieBuilder.write(s, length);
    const int32_t nodeType=UCharTrie::kMinLinearMatch+length-1;
    offset=trieBuilder.writeValueAndType(hasValue, value, nodeType);
}
// Serializes this list-branch node: first the sub-nodes of all but the last
// unit, then the last unit with its sub-node or final value, and finally the
// (unit, final value or jump delta) pairs of the remaining units.
// An equal[] entry of NULL means that the unit has a final value in values[].
void
UCharTrieBuilder::UCTListBranchNode::write(DictTrieBuilder &builder) {
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t unitNumber=length-1;
Node *rightEdge=equal[unitNumber];
// NOTE(review): firstEdgeNumber and getOffset() are presumably the edge
// numbers assigned by markRightEdgesFirst() at this point -- confirm in
// DictTrieBuilder.
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
// This do-while relies on the node holding at least two units.
do {
--unitNumber;
if(equal[unitNumber]!=NULL) {
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
if(rightEdge==NULL) {
b.writeValueAndFinal(values[unitNumber], TRUE);
} else {
rightEdge->write(builder);
}
b.write(units[unitNumber]);
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[unitNumber]==NULL) {
// Write the final value for the one string ending with this unit.
value=values[unitNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
U_ASSERT(equal[unitNumber]->getOffset()>0);
value=b.ucharsLength-equal[unitNumber]->getOffset();
isFinal=FALSE;
}
b.writeValueAndFinal(value, isFinal);
// offset is overwritten on every pass; the value that remains is the one
// from writing the first unit of the list.
offset=b.write(units[unitNumber]);
}
}
// Serializes both branches, then this node itself: the delta to the
// less-than branch followed by the comparison unit.
void
UCharTrieBuilder::UCTSplitBranchNode::write(DictTrieBuilder &builder) {
    UCharTrieBuilder &trieBuilder=static_cast<UCharTrieBuilder &>(builder);
    // The less-than branch goes first.
    const int32_t rightEdgeNumber=greaterOrEqual->getOffset();
    lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
    // The greater-or-equal branch goes last because there is no jump for it.
    greaterOrEqual->write(builder);
    // Now this node: jump delta to the less-than branch, then the unit.
    const int32_t lessThanOffset=lessThan->getOffset();
    U_ASSERT(lessThanOffset>0);
    trieBuilder.writeDelta(trieBuilder.ucharsLength-lessThanOffset);  // less-than
    offset=trieBuilder.write(unit);
}
// Serializes the branch body, then the head: the branch length minus 1 is
// stored in the node type when length<=kMinLinearMatch; otherwise it gets
// its own unit and the node type is 0.
void
UCharTrieBuilder::UCTBranchHeadNode::write(DictTrieBuilder &builder) {
    UCharTrieBuilder &trieBuilder=static_cast<UCharTrieBuilder &>(builder);
    next->write(builder);
    int32_t nodeType=length-1;
    if(length>UCharTrie::kMinLinearMatch) {
        // Too large for the node type: write the length as a separate unit.
        trieBuilder.write(nodeType);
        nodeType=0;
    }
    offset=trieBuilder.writeValueAndType(hasValue, value, nodeType);
}
// Makes sure that the uchars buffer can hold `length` units.
// The buffer is filled from its end, so on growth the existing contents are
// copied to the end of the new, larger buffer.
// Returns FALSE if there is no buffer (an earlier allocation failed) or if
// the larger buffer cannot be allocated; in the latter case the old buffer is
// freed and uchars is set to NULL.
UBool
UCharTrieBuilder::ensureCapacity(int32_t length) {
    if(uchars==NULL) {
        return FALSE;  // previous memory allocation had failed
    }
    if(length<=ucharsCapacity) {
        return TRUE;  // enough room already
    }
    // Double at least once, and keep doubling until the capacity exceeds length.
    int32_t grownCapacity=ucharsCapacity*2;
    while(grownCapacity<=length) {
        grownCapacity*=2;
    }
    UChar *grown=reinterpret_cast<UChar *>(uprv_malloc(grownCapacity*2));
    if(grown==NULL) {
        // unable to allocate memory
        uprv_free(uchars);
        uchars=NULL;
        return FALSE;
    }
    const UChar *oldStart=uchars+(ucharsCapacity-ucharsLength);
    u_memcpy(grown+(grownCapacity-ucharsLength), oldStart, ucharsLength);
    uprv_free(uchars);
    uchars=grown;
    ucharsCapacity=grownCapacity;
    return TRUE;
}
// Prepends one unit to the serialized trie (the buffer is filled from its end).
// Returns the resulting ucharsLength; it is unchanged if the buffer could not
// be grown.
int32_t
UCharTrieBuilder::write(int32_t unit) {
    const int32_t grownLength=ucharsLength+1;
    if(!ensureCapacity(grownLength)) {
        return ucharsLength;
    }
    ucharsLength=grownLength;
    uchars[ucharsCapacity-grownLength]=(UChar)unit;
    return grownLength;
}
// Prepends the units s[0..length[ to the serialized trie (the buffer is filled
// from its end).
// Returns the resulting ucharsLength; it is unchanged if the buffer could not
// be grown.
int32_t
UCharTrieBuilder::write(const UChar *s, int32_t length) {
    const int32_t grownLength=ucharsLength+length;
    if(!ensureCapacity(grownLength)) {
        return ucharsLength;
    }
    ucharsLength=grownLength;
    u_memcpy(uchars+(ucharsCapacity-grownLength), s, length);
    return grownLength;
}
// Writes the value i in one, two or three units, with the `final` flag in
// bit 15 of the lead unit. Returns the result of write().
int32_t
UCharTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
    UChar units[3];
    int32_t count;
    int32_t lead;
    const UBool fitsTwoUnits= 0<=i && i<=UCharTrie::kMaxTwoUnitValue;
    if(!fitsTwoUnits) {
        // Negative or large value: fixed lead unit plus both 16-bit halves.
        lead=UCharTrie::kThreeUnitValueLead;
        units[1]=(UChar)(i>>16);
        units[2]=(UChar)i;
        count=3;
    } else if(i>UCharTrie::kMaxOneUnitValue) {
        // High bits in the lead unit, low 16 bits in the second unit.
        lead=UCharTrie::kMinTwoUnitValueLead+(i>>16);
        units[1]=(UChar)i;
        count=2;
    } else {
        // Small value: the lead unit is the value itself.
        lead=i;
        count=1;
    }
    units[0]=(UChar)(lead|(final<<15));
    return write(units, count);
}
// Writes a node lead unit that combines the node type `node` with an optional
// intermediate value; the value takes one, two or three units.
// Without a value only the node type is written. Returns the result of write().
int32_t
UCharTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    if(!hasValue) {
        return write(node);
    }
    UChar units[3];
    int32_t count;
    int32_t lead;
    if(value<0 || UCharTrie::kMaxTwoUnitNodeValue<value) {
        // Negative or large value: fixed lead plus both 16-bit halves.
        lead=UCharTrie::kThreeUnitNodeValueLead;
        units[1]=(UChar)(value>>16);
        units[2]=(UChar)value;
        count=3;
    } else if(value<=UCharTrie::kMaxOneUnitNodeValue) {
        // Small value: value+1 goes above the low 6 bits of the lead unit.
        lead=(value+1)<<6;
        count=1;
    } else {
        // High bits in the lead unit, low 16 bits in the second unit.
        lead=UCharTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0);
        units[1]=(UChar)value;
        count=2;
    }
    units[0]=(UChar)(lead|node);
    return write(units, count);
}
// Writes the non-negative jump delta i in one, two or three units.
// Returns the result of write().
int32_t
UCharTrieBuilder::writeDelta(int32_t i) {
    U_ASSERT(i>=0);
    UChar units[3];
    int32_t count=0;
    if(i>UCharTrie::kMaxOneUnitDelta) {
        if(i<=UCharTrie::kMaxTwoUnitDelta) {
            // The lead unit carries the high bits.
            units[count++]=(UChar)(UCharTrie::kMinTwoUnitDeltaLead+(i>>16));
        } else {
            // Fixed lead unit followed by the high 16 bits.
            units[count++]=(UChar)(UCharTrie::kThreeUnitDeltaLead);
            units[count++]=(UChar)(i>>16);
        }
    }
    // The last unit is always the low 16 bits.
    units[count++]=(UChar)i;
    return write(units, count);
}
U_NAMESPACE_END

View file

@ -0,0 +1,112 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartriebuilder.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov14
* created by: Markus W. Scherer
*
* Builder class for UCharTrie dictionary trie.
*/
#ifndef __UCHARTRIEBUILDER_H__
#define __UCHARTRIEBUILDER_H__
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "dicttriebuilder.h"
U_NAMESPACE_BEGIN
class UCharTrieElement;
/**
 * Builder class for UCharTrie dictionary tries.
 *
 * Collect (string, value) pairs with add(), then call build() to get the
 * UChar-serialized trie. The serialized data lives in a buffer held by this
 * builder; build() hands out a UnicodeString that refers to that buffer.
 *
 * Instances cannot be copied or assigned.
 */
class U_TOOLUTIL_API UCharTrieBuilder : public DictTrieBuilder {
public:
    /** Constructs an empty builder without any allocated buffers. */
    UCharTrieBuilder()
            : elements(NULL), elementsCapacity(0), elementsLength(0),
              uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
    ~UCharTrieBuilder();

    /**
     * Adds a (string, value) pair.
     * @param s The input string.
     * @param value The value associated with this string.
     * @param errorCode Standard ICU error code.
     * @return *this
     */
    UCharTrieBuilder &add(const UnicodeString &s, int32_t value, UErrorCode &errorCode);

    /**
     * Builds the serialized trie for the pairs added so far.
     * @param buildOption Build option: fast direct serialization or compaction.
     * @param result Set to refer to the serialized trie in this builder's buffer.
     * @param errorCode Standard ICU error code.
     * @return result
     */
    UnicodeString &build(UDictTrieBuildOption buildOption, UnicodeString &result, UErrorCode &errorCode);

    /**
     * Removes all (string, value) pairs and discards any built trie.
     * The lengths are reset; the buffers themselves are kept.
     * @return *this
     */
    UCharTrieBuilder &clear() {
        strings.remove();
        elementsLength=0;
        ucharsLength=0;
        return *this;
    }

private:
    // Declared but not defined: the builder holds raw elements/uchars pointers
    // (uchars is allocated and freed in build()/ensureCapacity()), so a
    // member-wise copy would leave two builders sharing the same buffers.
    UCharTrieBuilder(const UCharTrieBuilder &other);  // no copy constructor
    UCharTrieBuilder &operator=(const UCharTrieBuilder &other);  // no assignment operator

    // Direct serialization.
    void writeNode(int32_t start, int32_t limit, int32_t unitIndex);
    void writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);

    // Node creation for the compacting builder.
    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
                            int32_t length, UErrorCode &errorCode);

    // Low-level output into the uchars buffer. The write functions return
    // the resulting ucharsLength.
    UBool ensureCapacity(int32_t length);
    int32_t write(int32_t unit);
    int32_t write(const UChar *s, int32_t length);
    int32_t writeValueAndFinal(int32_t i, UBool final);
    int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
    int32_t writeDelta(int32_t i);

    // Compacting builder.
    // Each node class adds UChar serialization to the generic node type from
    // DictTrieBuilder.
    class UCTFinalValueNode : public FinalValueNode {
    public:
        UCTFinalValueNode(int32_t v) : FinalValueNode(v) {}
        virtual void write(DictTrieBuilder &builder);
    };

    class UCTLinearMatchNode : public LinearMatchNode {
    public:
        UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode);
        virtual UBool operator==(const Node &other) const;
        virtual void write(DictTrieBuilder &builder);
    private:
        const UChar *s;  // The matched units; the pointer is stored, not a copy.
    };

    class UCTListBranchNode : public ListBranchNode {
    public:
        UCTListBranchNode() : ListBranchNode() {}
        virtual void write(DictTrieBuilder &builder);
    };

    class UCTSplitBranchNode : public SplitBranchNode {
    public:
        UCTSplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
                : SplitBranchNode(middleUnit, lessThanNode, greaterOrEqualNode) {}
        virtual void write(DictTrieBuilder &builder);
    };

    class UCTBranchHeadNode : public BranchHeadNode {
    public:
        UCTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
        virtual void write(DictTrieBuilder &builder);
    };

    virtual Node *createFinalValueNode(int32_t value) const { return new UCTFinalValueNode(value); }

    UnicodeString strings;
    UCharTrieElement *elements;
    int32_t elementsCapacity;
    int32_t elementsLength;

    // UChar serialization of the trie.
    // Grows from the back: ucharsLength measures from the end of the buffer!
    UChar *uchars;
    int32_t ucharsCapacity;
    int32_t ucharsLength;
};
U_NAMESPACE_END
#endif // __UCHARTRIEBUILDER_H__

View file

@ -0,0 +1,181 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartrieiterator.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov15
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "uchartrie.h"
#include "uchartrieiterator.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Iterates from the root of the serialized trie: the position starts at the
// first unit and no linear-match node is pending (remaining length -1).
UCharTrieIterator::UCharTrieIterator(const UChar *trieUChars, int32_t maxStringLength,
                                     UErrorCode &errorCode)
        : uchars_(trieUChars),
          pos_(trieUChars),
          initialPos_(trieUChars),
          remainingMatchLength_(-1),
          initialRemainingMatchLength_(-1),
          skipValue_(FALSE),
          maxLength_(maxStringLength),
          value_(0),
          stack_(errorCode) {
}
// Iterates from the current state of `trie`. If the trie is in the middle of
// a linear-match node, the units still to be matched are appended to the
// string right away, limited to maxStringLength if that is positive.
UCharTrieIterator::UCharTrieIterator(const UCharTrie &trie, int32_t maxStringLength,
                                     UErrorCode &errorCode)
        : uchars_(trie.uchars_),
          pos_(trie.pos_),
          initialPos_(trie.pos_),
          remainingMatchLength_(trie.remainingMatchLength_),
          initialRemainingMatchLength_(trie.remainingMatchLength_),
          skipValue_(FALSE),
          maxLength_(maxStringLength),
          value_(0),
          stack_(errorCode) {
    // remainingMatchLength_ holds the actual remaining match length minus 1.
    int32_t pending=remainingMatchLength_+1;
    if(pending>0) {
        // Pending linear-match node, append remaining UChars to str.
        if(maxLength_>0 && pending>maxLength_) {
            pending=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
        }
        str_.append(pos_, pending);
        pos_+=pending;
        remainingMatchLength_-=pending;
    }
}
// Puts the iterator back into the state it had right after construction:
// empty backtracking stack, and the string cut back to the units of the
// initially pending linear-match node (limited to maxLength_ if positive).
UCharTrieIterator &UCharTrieIterator::reset() {
    skipValue_=FALSE;
    stack_.setSize(0);
    // Number of units that were pending at construction time.
    int32_t pending=initialRemainingMatchLength_+1;
    if(maxLength_>0 && pending>maxLength_) {
        pending=maxLength_;
    }
    str_.truncate(pending);
    pos_=initialPos_+pending;
    remainingMatchLength_=initialRemainingMatchLength_-pending;
    return *this;
}
// Finds the next (string, value) pair, leaving it in str_ and value_.
// Returns FALSE if errorCode indicates a failure on entry, or if the iteration
// is finished (no current position and nothing left on the stack).
// Returns TRUE after delivering a pair; value_ is -1 if the string was cut off
// at maxLength_ without a real value (see truncateAndStop()).
UBool
UCharTrieIterator::next(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return FALSE;
}
const UChar *pos=pos_;
// pos_==NULL: the previous call ended a string (final value or truncation),
// so continue from the most recently pushed branch state, if there is one.
if(pos==NULL) {
if(stack_.isEmpty()) {
return FALSE;
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
int32_t stackSize=stack_.size();
int32_t length=stack_.elementAti(stackSize-1);
pos=uchars_+stack_.elementAti(stackSize-2);
stack_.setSize(stackSize-2);
// Bits 15..0: string length from before the branch node.
// Bits 31..16: number of branch edges that remain.
str_.truncate(length&0xffff);
length=(int32_t)((uint32_t)length>>16);
if(length>1) {
pos=branchNext(pos, length, errorCode);
if(pos==NULL) {
return TRUE; // Reached a final value.
}
} else {
// Last edge of the branch: only its unit is read here, and the loop
// below continues right behind it.
str_.append(*pos++);
}
}
if(remainingMatchLength_>=0) {
// We only get here if we started in a pending linear-match node
// with more than maxLength remaining units.
return truncateAndStop();
}
for(;;) {
int32_t node=*pos++;
if(node>=UCharTrie::kMinValueLead) {
if(skipValue_) {
// The value was delivered by the previous call; skip it and keep only
// the node type bits.
pos=UCharTrie::skipNodeValue(pos, node);
node&=UCharTrie::kNodeTypeMask;
skipValue_=FALSE;
} else {
// Deliver value for the string so far.
UBool isFinal=(UBool)(node>>15);
if(isFinal) {
value_=UCharTrie::readValue(pos, node&0x7fff);
} else {
value_=UCharTrie::readNodeValue(pos, node);
}
if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
// Nothing more to follow from here; the next call pops the stack.
pos_=NULL;
} else {
// We cannot skip the value right here because it shares its
// lead unit with a match node which we have to evaluate
// next time.
// Instead, keep pos_ on the node lead unit itself.
pos_=pos-1;
skipValue_=TRUE;
}
return TRUE;
}
}
if(maxLength_>0 && str_.length()==maxLength_) {
return truncateAndStop();
}
if(node<UCharTrie::kMinLinearMatch) {
// Branch node: a node type of 0 means that the branch length minus 1 is
// stored in the following unit.
if(node==0) {
node=*pos++;
}
pos=branchNext(pos, node+1, errorCode);
if(pos==NULL) {
return TRUE; // Reached a final value.
}
} else {
// Linear-match node, append length units to str_.
int32_t length=node-UCharTrie::kMinLinearMatch+1;
if(maxLength_>0 && str_.length()+length>maxLength_) {
// Append only as many units as still fit, then stop without a value.
str_.append(pos, maxLength_-str_.length());
return truncateAndStop();
}
str_.append(pos, length);
pos+=length;
}
}
}
// Branch node, needs to take the first outbound edge and push state for the rest.
// pos points to the branch data and length is the number of edges to cover.
// Returns the position to continue from, or NULL if the first edge ends in a
// final value; in that case pos_ is set to NULL and value_ to that value.
// Each pushed state is a pair: the offset of the remaining edges from uchars_,
// then ((remaining edge count)<<16)|(current string length).
const UChar *
UCharTrieIterator::branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode) {
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison unit
// Push state for the greater-or-equal edge.
stack_.addElement((int32_t)(UCharTrie::skipDelta(pos)-uchars_), errorCode);
stack_.addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
// Follow the less-than edge.
length>>=1;
pos=UCharTrie::jumpByDelta(pos);
}
// List of key-value pairs where values are either final values or jump deltas.
// Read the first (key, value) pair.
UChar trieUnit=*pos++;
int32_t node=*pos++;
// Bit 15 of the value lead unit is the final-value flag.
UBool isFinal=(UBool)(node>>15);
int32_t value=UCharTrie::readValue(pos, node&=0x7fff);
pos=UCharTrie::skipValue(pos, node);
// Push the rest of the list: it starts right behind this pair and has one
// edge fewer.
stack_.addElement((int32_t)(pos-uchars_), errorCode);
stack_.addElement(((length-1)<<16)|str_.length(), errorCode);
str_.append(trieUnit);
if(isFinal) {
pos_=NULL;
value_=value;
return NULL;
} else {
// Not final: value is a jump delta from behind the pair to the sub-node.
return pos+value;
}
}
U_NAMESPACE_END

View file

@ -0,0 +1,121 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartrieiterator.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov15
* created by: Markus W. Scherer
*/
#ifndef __UCHARTRIEITERATOR_H__
#define __UCHARTRIEITERATOR_H__
/**
* \file
* \brief C++ API: UCharTrie iterator for all of its (string, value) pairs.
*/
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "uchartrie.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/**
* Iterator for all of the (string, value) pairs in a UCharTrie.
* Call next() to advance; after it returns TRUE, getString() and getValue()
* return the pair that was found.
*/
class U_TOOLUTIL_API UCharTrieIterator : public UMemory {
public:
/**
* Iterates from the root of a UChar-serialized UCharTrie.
* @param trieUChars The trie UChars.
* @param maxStringLength If 0, the iterator returns full strings.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
UCharTrieIterator(const UChar *trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Iterates from the current state of the specified UCharTrie.
* @param trie The trie whose state will be copied for iteration.
* @param maxStringLength If 0, the iterator returns full strings.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
UCharTrieIterator(const UCharTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Resets this iterator to its initial state.
* @return *this
*/
UCharTrieIterator &reset();
/**
* Finds the next (string, value) pair if there is one.
*
* If the string is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return TRUE if there is another element.
*/
UBool next(UErrorCode &errorCode);
/**
* @return TRUE if there are more elements.
*/
UBool hasNext() const { return pos_!=NULL || !stack_.isEmpty(); }
/**
* @return the string for the last successful next()
*/
const UnicodeString &getString() const { return str_; }
/**
* @return the value for the last successful next()
*/
int32_t getValue() const { return value_; }
private:
// Ends the current string without a real value: no position to continue
// from, and value_ set to -1.
UBool truncateAndStop() {
pos_=NULL;
value_=-1; // no real value for str
return TRUE;
}
const UChar *branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode);
const UChar *uchars_; // Start of the serialized trie; stack offsets are relative to it.
const UChar *pos_; // Current position; NULL after a final value or truncation.
const UChar *initialPos_; // Position at construction time, restored by reset().
int32_t remainingMatchLength_; // Remaining length of a pending linear-match node, minus 1.
int32_t initialRemainingMatchLength_; // Value at construction time, restored by reset().
UBool skipValue_; // Skip intermediate value which was already delivered.
UnicodeString str_; // The string built so far.
int32_t maxLength_; // Maximum string length; 0 means no limit.
int32_t value_; // Value for the last successful next().
// The stack stores pairs of integers for backtracking to another
// outbound edge of a branch node.
// The first integer is an offset from uchars_.
// The second integer has the str.length() from before the node in bits 15..0,
// and the remaining branch length in bits 31..16.
// (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
// but the code looks more confusing that way.)
UVector32 stack_;
};
U_NAMESPACE_END
#endif // __UCHARTRIEITERATOR_H__

View file

@ -216,3 +216,42 @@ usrc_writeUTrie2Struct(FILE *f,
fputs(postfix, f);
}
}
/*
 * Writes the bytes p[0..length[ as a comma-separated list of C initializers.
 * Bytes 0..0x1f are written as decimal numbers, all other bytes as character
 * literals in single quotes. A single quote or a backslash is written with a
 * backslash escape so that the output remains a valid character literal.
 *
 * Lines are broken after at most 32 items, and earlier at "interesting"
 * places, to keep revision diffs of the generated file small.
 *
 * f       The output file.
 * prefix  Optional (can be NULL). Used as a printf format string with the
 *         array length as a long argument, so it may contain one %ld field.
 * p       The bytes to write.
 * length  The number of bytes in p.
 * postfix Optional (can be NULL). Written verbatim after the array contents.
 */
U_CAPI void U_EXPORT2
usrc_writeArrayOfMostlyInvChars(FILE *f,
                                const char *prefix,
                                const char *p, int32_t length,
                                const char *postfix) {
    int32_t i, col;
    int prev2, prev, c;

    if(prefix!=NULL) {
        fprintf(f, prefix, (long)length);
    }
    /* prev and prev2 are the previous two bytes; -1 while there are none. */
    prev2=prev=-1;
    for(i=col=0; i<length; ++i, ++col) {
        c=(uint8_t)p[i];
        if(i>0) {
            /* Break long lines. Try to break at interesting places, to minimize revision diffs. */
            if(
                /* Very long line. */
                col>=32 ||
                /* Long line, break after terminating NUL. */
                (col>=24 && prev2>=0x20 && prev==0) ||
                /* Medium-long line, break before non-NUL, non-character byte. */
                (col>=16 && (prev==0 || prev>=0x20) && 0<c && c<0x20)
            ) {
                fputs(",\n", f);
                col=0;
            } else {
                fputc(',', f);
            }
        }
        if(c<0x20) {
            fprintf(f, "%u", (unsigned int)c);
        } else if(c=='\'' || c=='\\') {
            /* ''' and '\' are not valid character literals. */
            fprintf(f, "'\\%c'", c);
        } else {
            fprintf(f, "'%c'", c);
        }
        prev2=prev;
        prev=c;
    }
    if(postfix!=NULL) {
        fputs(postfix, f);
    }
}

View file

@ -24,21 +24,21 @@
#include "utrie2.h"
/**
* Create a source text file and write a header comment with the ICU copyright.
* Creates a source text file and writes a header comment with the ICU copyright.
* Writes a C/Java-style comment.
*/
U_CAPI FILE * U_EXPORT2
usrc_create(const char *path, const char *filename);
/**
* Create a source text file and write a header comment with the ICU copyright.
* Creates a source text file and writes a header comment with the ICU copyright.
* Writes the comment with # lines, as used in scripts and text data.
*/
U_CAPI FILE * U_EXPORT2
usrc_createTextData(const char *path, const char *filename);
/**
* Write the contents of an array of 8/16/32-bit words.
* Writes the contents of an array of 8/16/32-bit words.
* The prefix and postfix are optional (can be NULL) and are written first/last.
* The prefix may contain a %ld or similar field for the array length.
* The {} and declaration etc. need to be included in prefix/postfix or
@ -73,4 +73,20 @@ usrc_writeUTrie2Struct(FILE *f,
const char *indexName, const char *dataName,
const char *postfix);
/**
* Writes the contents of an array of mostly invariant characters.
* Characters 0..0x1f are printed as numbers,
* others as characters with single quotes: '%c'.
*
* The prefix and postfix are optional (can be NULL) and are written first/last.
* The prefix may contain a %ld or similar field for the array length.
* The {} and declaration etc. need to be included in prefix/postfix or
* printed before and after the array contents.
*/
U_CAPI void U_EXPORT2
usrc_writeArrayOfMostlyInvChars(FILE *f,
const char *prefix,
const char *p, int32_t length,
const char *postfix);
#endif