ICU-12507 ICU4C RBBI, switch to UTrie2

X-SVN-Rev: 40105
This commit is contained in:
Andy Heninger 2017-05-03 23:44:14 +00:00
parent b10a17be24
commit a3a2b57516
6 changed files with 41 additions and 121 deletions

View file

@ -1078,7 +1078,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
// Note: the 16 in UTRIE2_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1275,7 +1275,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// Note: the 16 in UTRIE2_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1510,26 +1510,6 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*
}
//-------------------------------------------------------------------------------
//
// isDictionaryChar Return true if the category lookup for this char
// indicates that it is in the set of dictionary lookup
// chars.
//
// This function is intended for use by dictionary based
// break iterators.
//
//-------------------------------------------------------------------------------
/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
if (fData == NULL) {
return FALSE;
}
uint16_t category;
UTRIE_GET16(&fData->fTrie, c, category);
return (category & 0x4000) != 0;
}*/
//-------------------------------------------------------------------------------
//
// checkDictionary This function handles all processing of characters in
@ -1569,7 +1549,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
int32_t foundBreakCount = 0;
UChar32 c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
// Is the character we're starting on a dictionary character? If so, we
// need to back up to include the entire run; otherwise the results of
@ -1581,7 +1561,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
do {
utext_next32(fText); // TODO: recast to work directly with postincrement.
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
} while (c != U_SENTINEL && (category & 0x4000));
// Back up to the last dictionary character
rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
@ -1597,7 +1577,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
else {
do {
c = UTEXT_PREVIOUS32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
while (c != U_SENTINEL && (category & 0x4000));
// Back up to the last dictionary character
@ -1611,7 +1591,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
}
rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
}
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
// Loop through the text, looking for ranges of dictionary characters.
@ -1622,13 +1602,13 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
if (reverse) {
utext_setNativeIndex(fText, rangeStart);
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
while(U_SUCCESS(status)) {
while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
utext_next32(fText); // TODO: tweak for post-increment operation
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
if (current >= rangeEnd) {
break;
@ -1646,7 +1626,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
// Reload the loop variables for the next go-round
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
// If we found breaks, build a new break cache. The first and last entries must

View file

@ -23,23 +23,6 @@
#include "uassert.h"
//-----------------------------------------------------------------------------------
//
// Trie access folding function. Copied as-is from properties code in uchar.c
//
//-----------------------------------------------------------------------------------
U_CDECL_BEGIN
// Extracts the folding offset from a 16-bit trie result.
// Bit 15 flags that an offset is present; when it is set the offset is the
// low 15 bits (14..0) of the value, otherwise the offset is 0.
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    const uint32_t kHasOffsetFlag = 0x8000;   // bit 15
    const uint32_t kOffsetMask    = 0x7fff;   // bits 14..0

    if ((data & kHasOffsetFlag) == 0) {
        return 0;
    }
    return (int32_t)(data & kOffsetMask);
}
U_CDECL_END
U_NAMESPACE_BEGIN
//-----------------------------------------------------------------------------
@ -98,6 +81,7 @@ void RBBIDataWrapper::init0() {
fSafeRevTable = NULL;
fRuleSource = NULL;
fRuleStatusTable = NULL;
fTrie = NULL;
fUDataMem = NULL;
fRefCount = 0;
fDontFreeData = TRUE;
@ -132,15 +116,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
}
utrie_unserialize(&fTrie,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
&status);
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
NULL, // *actual length
&status);
if (U_FAILURE(status)) {
return;
}
fTrie.getFoldingOffset=getFoldingOffset;
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
fRuleString.setTo(TRUE, fRuleSource, -1);
@ -165,6 +148,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
//-----------------------------------------------------------------------------
RBBIDataWrapper::~RBBIDataWrapper() {
U_ASSERT(fRefCount == 0);
utrie2_close(fTrie);
fTrie = NULL;
if (fUDataMem) {
udata_close(fUDataMem);
} else if (!fDontFreeData) {
@ -451,8 +436,8 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
}
// Trie table for character categories
utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
// Source Rules Text. It's UChar data
ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),

View file

@ -52,7 +52,7 @@ ubrk_swap(const UDataSwapper *ds,
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "umutex.h"
#include "utrie.h"
#include "utrie2.h"
U_NAMESPACE_BEGIN
@ -181,7 +181,7 @@ public:
/* number of int32_t values in the rule status table. Used to sanity check indexing */
int32_t fStatusMaxIdx;
UTrie fTrie;
UTrie2 *fTrie;
private:
u_atomic_int32_t fRefCount;

View file

@ -35,7 +35,7 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uniset.h"
#include "utrie.h"
#include "utrie2.h"
#include "uvector.h"
#include "uassert.h"
#include "cmemory.h"
@ -44,43 +44,6 @@
#include "rbbisetb.h"
#include "rbbinode.h"
//------------------------------------------------------------------------
//
// getFoldedRBBIValue Call-back function used during building of Trie table.
// Folding value: just store the offset (16 bits)
// if there is any non-0 entry.
// (It'd really be nice if the Trie builder would provide a
// simple default, so this function could go away from here.)
//
//------------------------------------------------------------------------
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
U_CDECL_BEGIN
// Trie-builder callback: computes the folded value for the 0x400 code points
// beginning at `start`.
// Returns (offset | 0x8000) as soon as any code point in that span has a
// non-zero trie value, and 0 when every value in the span is zero.
static uint32_t U_CALLCONV
getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    const UChar32 spanEnd = start + 0x400;
    UChar32 cp = start;

    while (cp < spanEnd) {
        UBool inBlockZero;
        const uint32_t cpValue = utrie_get32(trie, cp, &inBlockZero);

        if (inBlockZero) {
            // utrie_get32 reported block zero for this code point:
            // advance by a full data block instead of one code point.
            cp += UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }
        if (cpValue != 0) {
            return (uint32_t)(offset | 0x8000);
        }
        ++cp;
    }
    return 0;
}
U_CDECL_END
U_NAMESPACE_BEGIN
//------------------------------------------------------------------------
@ -116,7 +79,7 @@ RBBISetBuilder::~RBBISetBuilder()
delete r;
}
utrie_close(fTrie);
utrie2_close(fTrie);
}
@ -287,33 +250,30 @@ void RBBISetBuilder::build() {
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
//
fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
NULL, // Data array (utrie will allocate one)
100000, // Max Data Length
0, // Initial value for all code points
0, // Lead surrogate unit value
TRUE); // Keep Latin 1 in separately
fTrie = utrie2_open(0, // Initial value for all code points
0, // errorValue
fStatus);
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus);
}
}
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int32_t RBBISetBuilder::getTrieSize() /*const*/ {
// Measures the serialized form of fTrie by invoking the serializer with a
// NULL buffer and zero capacity, stores the result in fTrieSize and returns it.
// NOTE(review): this listing carries both a utrie_serialize call and a
// utrie2_freeze/utrie2_serialize pair; presumably the utrie_serialize call is
// the pre-UTrie2 code that the utrie2 calls replace -- confirm against the
// actual file.
fTrieSize = utrie_serialize(fTrie,
NULL, // Buffer
0, // Capacity
getFoldedRBBIValue,
TRUE, // Reduce to 16 bits
fStatus);
// utrie2_freeze is called with UTRIE2_16_VALUE_BITS before the size query.
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
fTrieSize = utrie2_serialize(fTrie,
NULL, // Buffer
0, // Capacity
fStatus);
// A buffer-overflow status is cleared here: with no output buffer supplied it
// is the anticipated outcome of a size-only call, not a real failure.
// Any other status value is left untouched.
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
*fStatus = U_ZERO_ERROR;
}
// RBBIDebugPrintf("Trie table size is %d\n", trieSize);
return fTrieSize;
}
@ -327,12 +287,10 @@ int32_t RBBISetBuilder::getTrieSize() /*const*/ {
//
//-----------------------------------------------------------------------------------
void RBBISetBuilder::serializeTrie(uint8_t *where) {
// Serializes fTrie into the caller-supplied buffer `where`, passing fTrieSize
// (the value assigned in getTrieSize()) as the buffer capacity. Errors are
// reported through fStatus; the serializer's return value is not used.
// NOTE(review): this listing carries both a utrie_serialize call and a
// utrie2_serialize call; presumably the utrie_serialize call is the pre-UTrie2
// code that the utrie2 call replaces -- confirm against the actual file.
utrie_serialize(fTrie,
where, // Buffer
fTrieSize, // Capacity
getFoldedRBBIValue,
TRUE, // Reduce to 16 bits
fStatus);
utrie2_serialize(fTrie,
where, // Buffer
fTrieSize, // Capacity
fStatus);
}
//------------------------------------------------------------------------

View file

@ -15,10 +15,9 @@
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "rbbirb.h"
#include "utrie2.h"
#include "uvector.h"
struct UNewTrie;
U_NAMESPACE_BEGIN
//
@ -109,7 +108,7 @@ private:
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
// Groups correspond to character categories -

View file

@ -32,8 +32,6 @@
#include "unicode/uchriter.h"
struct UTrie;
U_NAMESPACE_BEGIN
/** @internal */