mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-12507 ICU4C RBBI, switch to UTrie2
X-SVN-Rev: 40105
This commit is contained in:
parent
b10a17be24
commit
a3a2b57516
6 changed files with 41 additions and 121 deletions
|
@ -1078,7 +1078,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
|||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
|
@ -1275,7 +1275,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
|
@ -1510,26 +1510,6 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*
|
|||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// isDictionaryChar Return true if the category lookup for this char
|
||||
// indicates that it is in the set of dictionary lookup
|
||||
// chars.
|
||||
//
|
||||
// This function is intended for use by dictionary based
|
||||
// break iterators.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
|
||||
if (fData == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
uint16_t category;
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
return (category & 0x4000) != 0;
|
||||
}*/
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// checkDictionary This function handles all processing of characters in
|
||||
|
@ -1569,7 +1549,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
int32_t foundBreakCount = 0;
|
||||
UChar32 c = utext_current32(fText);
|
||||
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
|
||||
// Is the character we're starting on a dictionary character? If so, we
|
||||
// need to back up to include the entire run; otherwise the results of
|
||||
|
@ -1581,7 +1561,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
do {
|
||||
utext_next32(fText); // TODO: recast to work directly with postincrement.
|
||||
c = utext_current32(fText);
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
} while (c != U_SENTINEL && (category & 0x4000));
|
||||
// Back up to the last dictionary character
|
||||
rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
|
||||
|
@ -1597,7 +1577,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
else {
|
||||
do {
|
||||
c = UTEXT_PREVIOUS32(fText);
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
}
|
||||
while (c != U_SENTINEL && (category & 0x4000));
|
||||
// Back up to the last dictionary character
|
||||
|
@ -1611,7 +1591,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
}
|
||||
rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
|
||||
}
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
}
|
||||
|
||||
// Loop through the text, looking for ranges of dictionary characters.
|
||||
|
@ -1622,13 +1602,13 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
if (reverse) {
|
||||
utext_setNativeIndex(fText, rangeStart);
|
||||
c = utext_current32(fText);
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
}
|
||||
while(U_SUCCESS(status)) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
|
||||
utext_next32(fText); // TODO: tweak for post-increment operation
|
||||
c = utext_current32(fText);
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
}
|
||||
if (current >= rangeEnd) {
|
||||
break;
|
||||
|
@ -1646,7 +1626,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
|
|||
|
||||
// Reload the loop variables for the next go-round
|
||||
c = utext_current32(fText);
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
}
|
||||
|
||||
// If we found breaks, build a new break cache. The first and last entries must
|
||||
|
|
|
@ -23,23 +23,6 @@
|
|||
#include "uassert.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// Trie access folding function. Copied as-is from properties code in uchar.c
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
U_CDECL_BEGIN
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingOffset(uint32_t data) {
|
||||
/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -98,6 +81,7 @@ void RBBIDataWrapper::init0() {
|
|||
fSafeRevTable = NULL;
|
||||
fRuleSource = NULL;
|
||||
fRuleStatusTable = NULL;
|
||||
fTrie = NULL;
|
||||
fUDataMem = NULL;
|
||||
fRefCount = 0;
|
||||
fDontFreeData = TRUE;
|
||||
|
@ -132,15 +116,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
}
|
||||
|
||||
|
||||
utrie_unserialize(&fTrie,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
&status);
|
||||
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
NULL, // *actual length
|
||||
&status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fTrie.getFoldingOffset=getFoldingOffset;
|
||||
|
||||
|
||||
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
|
||||
fRuleString.setTo(TRUE, fRuleSource, -1);
|
||||
|
@ -165,6 +148,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
//-----------------------------------------------------------------------------
|
||||
RBBIDataWrapper::~RBBIDataWrapper() {
|
||||
U_ASSERT(fRefCount == 0);
|
||||
utrie2_close(fTrie);
|
||||
fTrie = NULL;
|
||||
if (fUDataMem) {
|
||||
udata_close(fUDataMem);
|
||||
} else if (!fDontFreeData) {
|
||||
|
@ -451,8 +436,8 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
|
|||
}
|
||||
|
||||
// Trie table for character categories
|
||||
utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
|
||||
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
|
||||
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
|
||||
|
||||
// Source Rules Text. It's UChar data
|
||||
ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
|
||||
|
|
|
@ -52,7 +52,7 @@ ubrk_swap(const UDataSwapper *ds,
|
|||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "umutex.h"
|
||||
#include "utrie.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -181,7 +181,7 @@ public:
|
|||
/* number of int32_t values in the rule status table. Used to sanity check indexing */
|
||||
int32_t fStatusMaxIdx;
|
||||
|
||||
UTrie fTrie;
|
||||
UTrie2 *fTrie;
|
||||
|
||||
private:
|
||||
u_atomic_int32_t fRefCount;
|
||||
|
|
|
@ -35,7 +35,7 @@
|
|||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "utrie.h"
|
||||
#include "utrie2.h"
|
||||
#include "uvector.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -44,43 +44,6 @@
|
|||
#include "rbbisetb.h"
|
||||
#include "rbbinode.h"
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getFoldedRBBIValue Call-back function used during building of Trie table.
|
||||
// Folding value: just store the offset (16 bits)
|
||||
// if there is any non-0 entry.
|
||||
// (It'd really be nice if the Trie builder would provide a
|
||||
// simple default, so this function could go away from here.)
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
||||
U_CDECL_BEGIN
|
||||
static uint32_t U_CALLCONV
|
||||
getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else if(value!=0) {
|
||||
return (uint32_t)(offset|0x8000);
|
||||
} else {
|
||||
++start;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
|
@ -116,7 +79,7 @@ RBBISetBuilder::~RBBISetBuilder()
|
|||
delete r;
|
||||
}
|
||||
|
||||
utrie_close(fTrie);
|
||||
utrie2_close(fTrie);
|
||||
}
|
||||
|
||||
|
||||
|
@ -287,33 +250,30 @@ void RBBISetBuilder::build() {
|
|||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number
|
||||
//
|
||||
fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
|
||||
NULL, // Data array (utrie will allocate one)
|
||||
100000, // Max Data Length
|
||||
0, // Initial value for all code points
|
||||
0, // Lead surrogate unit value
|
||||
TRUE); // Keep Latin 1 in separately
|
||||
|
||||
fTrie = utrie2_open(0, // Initial value for all code points
|
||||
0, // errorValue
|
||||
fStatus);
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
|
||||
utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getTrieSize() /*const*/ {
|
||||
fTrieSize = utrie_serialize(fTrie,
|
||||
NULL, // Buffer
|
||||
0, // Capacity
|
||||
getFoldedRBBIValue,
|
||||
TRUE, // Reduce to 16 bits
|
||||
fStatus);
|
||||
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
|
||||
fTrieSize = utrie2_serialize(fTrie,
|
||||
NULL, // Buffer
|
||||
0, // Capacity
|
||||
fStatus);
|
||||
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
|
||||
*fStatus = U_ZERO_ERROR;
|
||||
}
|
||||
// RBBIDebugPrintf("Trie table size is %d\n", trieSize);
|
||||
return fTrieSize;
|
||||
}
|
||||
|
@ -327,12 +287,10 @@ int32_t RBBISetBuilder::getTrieSize() /*const*/ {
|
|||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void RBBISetBuilder::serializeTrie(uint8_t *where) {
|
||||
utrie_serialize(fTrie,
|
||||
where, // Buffer
|
||||
fTrieSize, // Capacity
|
||||
getFoldedRBBIValue,
|
||||
TRUE, // Reduce to 16 bits
|
||||
fStatus);
|
||||
utrie2_serialize(fTrie,
|
||||
where, // Buffer
|
||||
fTrieSize, // Capacity
|
||||
fStatus);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
|
|
|
@ -15,10 +15,9 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "rbbirb.h"
|
||||
#include "utrie2.h"
|
||||
#include "uvector.h"
|
||||
|
||||
struct UNewTrie;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
//
|
||||
|
@ -109,7 +108,7 @@ private:
|
|||
|
||||
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
|
||||
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
|
||||
uint32_t fTrieSize; // the Unicode Sets.
|
||||
|
||||
// Groups correspond to character categories -
|
||||
|
|
|
@ -32,8 +32,6 @@
|
|||
#include "unicode/uchriter.h"
|
||||
|
||||
|
||||
struct UTrie;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/** @internal */
|
||||
|
|
Loading…
Add table
Reference in a new issue