ICU-12507 rbbi, switch impl from UTrie to UTrie2.

X-SVN-Rev: 40270
This commit is contained in:
Andy Heninger 2017-07-19 22:31:12 +00:00
commit 1292197198
12 changed files with 138 additions and 277 deletions

View file

@ -1078,7 +1078,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
// Note: the 16 in UTRIE2_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1275,7 +1275,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// Note: the 16 in UTRIE2_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1510,26 +1510,6 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*
}
//-------------------------------------------------------------------------------
//
// isDictionaryChar Return true if the category lookup for this char
// indicates that it is in the set of dictionary lookup
// chars.
//
// This function is intended for use by dictionary based
// break iterators.
//
//-------------------------------------------------------------------------------
/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
if (fData == NULL) {
return FALSE;
}
uint16_t category;
UTRIE_GET16(&fData->fTrie, c, category);
return (category & 0x4000) != 0;
}*/
//-------------------------------------------------------------------------------
//
// checkDictionary This function handles all processing of characters in
@ -1569,7 +1549,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
int32_t foundBreakCount = 0;
UChar32 c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
// Is the character we're starting on a dictionary character? If so, we
// need to back up to include the entire run; otherwise the results of
@ -1581,7 +1561,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
do {
utext_next32(fText); // TODO: recast to work directly with postincrement.
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
} while (c != U_SENTINEL && (category & 0x4000));
// Back up to the last dictionary character
rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
@ -1597,7 +1577,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
else {
do {
c = UTEXT_PREVIOUS32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
while (c != U_SENTINEL && (category & 0x4000));
// Back up to the last dictionary character
@ -1611,7 +1591,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
}
rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
}
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
// Loop through the text, looking for ranges of dictionary characters.
@ -1622,13 +1602,13 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
if (reverse) {
utext_setNativeIndex(fText, rangeStart);
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
while(U_SUCCESS(status)) {
while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
utext_next32(fText); // TODO: tweak for post-increment operation
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
if (current >= rangeEnd) {
break;
@ -1646,7 +1626,7 @@ int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos,
// Reload the loop variables for the next go-round
c = utext_current32(fText);
UTRIE_GET16(&fData->fTrie, c, category);
category = UTRIE2_GET16(fData->fTrie, c);
}
// If we found breaks, build a new break cache. The first and last entries must

View file

@ -23,23 +23,6 @@
#include "uassert.h"
//-----------------------------------------------------------------------------------
//
// Trie access folding function. Copied as-is from properties code in uchar.c
//
//-----------------------------------------------------------------------------------
U_CDECL_BEGIN
/*
 * Trie folding callback.
 * Bit 15 of the 16-bit trie result flags the presence of a folding offset;
 * when it is set the offset is carried in bits 14..0, otherwise the result is 0.
 */
static int32_t U_CALLCONV
getFoldingOffset(uint32_t data) {
    const uint32_t hasOffsetBit = 0x8000;
    const uint32_t offsetMask   = 0x7fff;
    return (data & hasOffsetBit) != 0 ? (int32_t)(data & offsetMask) : 0;
}
U_CDECL_END
U_NAMESPACE_BEGIN
//-----------------------------------------------------------------------------
@ -71,9 +54,8 @@ RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
dh->info.dataFormat[0] == 0x42 && // dataFormat="Brk "
dh->info.dataFormat[1] == 0x72 &&
dh->info.dataFormat[2] == 0x6b &&
dh->info.dataFormat[3] == 0x20)
// Note: info.fFormatVersion is duplicated in the RBBIDataHeader, and is
// validated when checking that.
dh->info.dataFormat[3] == 0x20 &&
isDataVersionAcceptable(dh->info.formatVersion))
) {
status = U_INVALID_FORMAT_ERROR;
return;
@ -84,6 +66,11 @@ RBBIDataWrapper::RBBIDataWrapper(UDataMemory* udm, UErrorCode &status) {
fUDataMem = udm;
}
//
//  isDataVersionAcceptable()   Accept a data version when its first element equals
//                              the first element of RBBI_DATA_FORMAT_VERSION.
//                              The remaining elements are not examined.
//
UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
    return version[0] == RBBI_DATA_FORMAT_VERSION[0];
}
//-----------------------------------------------------------------------------
//
// init(). Does most of the work of construction, shared between the
@ -98,6 +85,7 @@ void RBBIDataWrapper::init0() {
fSafeRevTable = NULL;
fRuleSource = NULL;
fRuleStatusTable = NULL;
fTrie = NULL;
fUDataMem = NULL;
fRefCount = 0;
fDontFreeData = TRUE;
@ -108,8 +96,7 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
return;
}
fHeader = data;
if (fHeader->fMagic != 0xb1a0 || fHeader->fFormatVersion[0] != 3)
{
if (fHeader->fMagic != 0xb1a0 || !isDataVersionAcceptable(fHeader->fFormatVersion)) {
status = U_INVALID_FORMAT_ERROR;
return;
}
@ -132,15 +119,14 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
}
utrie_unserialize(&fTrie,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
&status);
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
NULL, // *actual length
&status);
if (U_FAILURE(status)) {
return;
}
fTrie.getFoldingOffset=getFoldingOffset;
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
fRuleString.setTo(TRUE, fRuleSource, -1);
@ -165,6 +151,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
//-----------------------------------------------------------------------------
RBBIDataWrapper::~RBBIDataWrapper() {
U_ASSERT(fRefCount == 0);
utrie2_close(fTrie);
fTrie = NULL;
if (fUDataMem) {
udata_close(fUDataMem);
} else if (!fDontFreeData) {
@ -323,7 +311,7 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
pInfo->dataFormat[1]==0x72 &&
pInfo->dataFormat[2]==0x6b &&
pInfo->dataFormat[3]==0x20 &&
pInfo->formatVersion[0]==3 )) {
RBBIDataWrapper::isDataVersionAcceptable(pInfo->formatVersion) )) {
udata_printError(ds, "ubrk_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
pInfo->dataFormat[2], pInfo->dataFormat[3],
@ -344,17 +332,11 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
//
// Get the RBBI Data Header, and check that it appears to be OK.
//
// Note: ICU 3.2 and earlier, RBBIDataHeader::fDataFormat was actually
// an int32_t with a value of 1. Starting with ICU 3.4,
// RBBI's fDataFormat matches the dataFormat field from the
// UDataInfo header, four int8_t bytes. The value is {3,1,0,0}
//
const uint8_t *inBytes =(const uint8_t *)inData+headerSize;
RBBIDataHeader *rbbiDH = (RBBIDataHeader *)inBytes;
if (ds->readUInt32(rbbiDH->fMagic) != 0xb1a0 ||
rbbiDH->fFormatVersion[0] != 3 ||
ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader))
{
!RBBIDataWrapper::isDataVersionAcceptable(rbbiDH->fFormatVersion) ||
ds->readUInt32(rbbiDH->fLength) < sizeof(RBBIDataHeader)) {
udata_printError(ds, "ubrk_swap(): RBBI Data header is invalid.\n");
*status=U_UNSUPPORTED_ERROR;
return 0;
@ -451,8 +433,8 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
}
// Trie table for character categories
utrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen),
outBytes+ds->readUInt32(rbbiDH->fTrie), status);
// Source Rules Text. It's UChar data
ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),

View file

@ -51,22 +51,23 @@ ubrk_swap(const UDataSwapper *ds,
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/uversion.h"
#include "umutex.h"
#include "utrie.h"
#include "utrie2.h"
U_NAMESPACE_BEGIN
// The current RBBI data format version.
static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {4, 0, 0, 0};
/*
* The following structs map exactly onto the raw data from ICU common data file.
*/
struct RBBIDataHeader {
uint32_t fMagic; /* == 0xb1a0 */
uint8_t fFormatVersion[4]; /* Data Format. Same as the value in struct UDataInfo */
UVersionInfo fFormatVersion; /* Data Format. Same as the value in struct UDataInfo */
/* if there is one associated with this data. */
/* (version originates in rbbi, is copied to UDataInfo) */
/* For ICU 3.2 and earlier, this field was */
/* uint32_t fVersion */
/* with a value of 1. */
uint32_t fLength; /* Total length in bytes of this RBBI Data, */
/* including all sections, not just the header. */
uint32_t fCatCount; /* Number of character categories. */
@ -152,6 +153,8 @@ public:
RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);
~RBBIDataWrapper();
static UBool isDataVersionAcceptable(const UVersionInfo version);
void init0();
void init(const RBBIDataHeader *data, UErrorCode &status);
RBBIDataWrapper *addReference();
@ -181,7 +184,7 @@ public:
/* number of int32_t values in the rule status table. Used to sanity check indexing */
int32_t fStatusMaxIdx;
UTrie fTrie;
UTrie2 *fTrie;
private:
u_atomic_int32_t fRefCount;

View file

@ -177,10 +177,10 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fMagic = 0xb1a0;
data->fFormatVersion[0] = 3;
data->fFormatVersion[1] = 1;
data->fFormatVersion[2] = 0;
data->fFormatVersion[3] = 0;
data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
data->fLength = totalSize;
data->fCatCount = fSetBuilder->getNumCharCategories();

View file

@ -35,7 +35,7 @@
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/uniset.h"
#include "utrie.h"
#include "utrie2.h"
#include "uvector.h"
#include "uassert.h"
#include "cmemory.h"
@ -44,43 +44,6 @@
#include "rbbisetb.h"
#include "rbbinode.h"
//------------------------------------------------------------------------
//
// getFoldedRBBIValue Call-back function used during building of Trie table.
// Folding value: just store the offset (16 bits)
// if there is any non-0 entry.
// (It'd really be nice if the Trie builder would provide a
// simple default, so this function could go away from here.)
//
//------------------------------------------------------------------------
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
U_CDECL_BEGIN
static uint32_t U_CALLCONV
getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    // Examine the 0x400 code points beginning at 'start'. As soon as any of
    // them carries a non-zero value, report the offset with bit 15 set.
    const UChar32 scanLimit = start + 0x400;
    UChar32       cp        = start;

    while (cp < scanLimit) {
        UBool          inBlockZero;
        const uint32_t cpValue = utrie_get32(trie, cp, &inBlockZero);

        if (inBlockZero) {
            // utrie_get32 flagged this position as inBlockZero: advance by a
            // whole data block instead of a single code point.
            cp += UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }
        if (cpValue != 0) {
            return (uint32_t)(offset | 0x8000);
        }
        ++cp;
    }

    // Nothing non-zero in the scanned range.
    return 0;
}
U_CDECL_END
U_NAMESPACE_BEGIN
//------------------------------------------------------------------------
@ -116,7 +79,7 @@ RBBISetBuilder::~RBBISetBuilder()
delete r;
}
utrie_close(fTrie);
utrie2_close(fTrie);
}
@ -287,33 +250,30 @@ void RBBISetBuilder::build() {
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
//
fTrie = utrie_open(NULL, // Pre-existing trie to be filled in
NULL, // Data array (utrie will allocate one)
100000, // Max Data Length
0, // Initial value for all code points
0, // Lead surrogate unit value
TRUE); // Keep Latin 1 in separately
fTrie = utrie2_open(0, // Initial value for all code points
0, // errorValue
fStatus);
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
utrie2_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar, rlRange->fNum, TRUE, fStatus);
}
}
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int32_t RBBISetBuilder::getTrieSize() /*const*/ {
fTrieSize = utrie_serialize(fTrie,
NULL, // Buffer
0, // Capacity
getFoldedRBBIValue,
TRUE, // Reduce to 16 bits
fStatus);
utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus);
fTrieSize = utrie2_serialize(fTrie,
NULL, // Buffer
0, // Capacity
fStatus);
if (*fStatus == U_BUFFER_OVERFLOW_ERROR) {
*fStatus = U_ZERO_ERROR;
}
// RBBIDebugPrintf("Trie table size is %d\n", trieSize);
return fTrieSize;
}
@ -327,12 +287,10 @@ int32_t RBBISetBuilder::getTrieSize() /*const*/ {
//
//-----------------------------------------------------------------------------------
void RBBISetBuilder::serializeTrie(uint8_t *where) {
utrie_serialize(fTrie,
where, // Buffer
fTrieSize, // Capacity
getFoldedRBBIValue,
TRUE, // Reduce to 16 bits
fStatus);
utrie2_serialize(fTrie,
where, // Buffer
fTrieSize, // Capacity
fStatus);
}
//------------------------------------------------------------------------

View file

@ -15,10 +15,9 @@
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "rbbirb.h"
#include "utrie2.h"
#include "uvector.h"
struct UNewTrie;
U_NAMESPACE_BEGIN
//
@ -109,7 +108,7 @@ private:
RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
UNewTrie *fTrie; // The mapping TRIE that is the end result of processing
UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
uint32_t fTrieSize; // the Unicode Sets.
// Groups correspond to character categories -

View file

@ -32,8 +32,6 @@
#include "unicode/uchriter.h"
struct UTrie;
U_NAMESPACE_BEGIN
/** @internal */

View file

@ -13,10 +13,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.Trie2;
/**
* <p>Internal class used for Rule Based Break Iterators</p>
@ -33,20 +32,20 @@ final class RBBIDataWrapper {
short fRTable[];
short fSFTable[];
short fSRTable[];
CharTrie fTrie;
Trie2 fTrie;
String fRuleSource;
int fStatusTable[];
private boolean isBigEndian;
static final int DATA_FORMAT = 0x42726b20; // "Brk "
static final int FORMAT_VERSION = 0x03010000; // 3.1
static final int DATA_FORMAT = 0x42726b20; // "Brk "
static final int FORMAT_VERSION = 0x04000000; // 4.0.0.0
private static final class IsAcceptable implements Authenticate {
// @Override when we switch to Java 6
@Override
public boolean isDataVersionAcceptable(byte version[]) {
return version[0] == (FORMAT_VERSION >>> 24);
int intVersion = (version[0] << 24) + (version[1] << 16) + (version[2] << 8) + version[3];
return intVersion == FORMAT_VERSION;
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
@ -105,7 +104,6 @@ final class RBBIDataWrapper {
*/
final static class RBBIDataHeader {
int fMagic; // == 0xb1a0
int fVersion; // == 1 (for ICU 3.2 and earlier.
byte[] fFormatVersion; // For ICU 3.4 and later.
int fLength; // Total length in bytes of this RBBI Data,
// including all sections, not just the header.
@ -147,19 +145,6 @@ final class RBBIDataWrapper {
return ROW_DATA + state * (fHeader.fCatCount + 4);
}
/**
 * Folding-offset callback handed to the trie.
 * When bit 15 of the trie data is set, the offset is the low 15 bits;
 * otherwise the offset is 0.
 */
static class TrieFoldingFunc implements Trie.DataManipulate {
    @Override
    public int getFoldingOffset(int data) {
        final boolean hasOffset = (data & 0x8000) != 0;
        return hasOffset ? (data & 0x7fff) : 0;
    }
}
static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
RBBIDataWrapper() {
}
@ -176,10 +161,6 @@ final class RBBIDataWrapper {
// Read in the RBBI data header...
This.fHeader = new RBBIDataHeader();
This.fHeader.fMagic = bytes.getInt();
// Read the same 4 bytes as an int and as a byte array: The data format could be
// the old fVersion=1 (TODO: probably not with a real ICU data header?)
// or the new fFormatVersion=3.x.
This.fHeader.fVersion = bytes.getInt(bytes.position());
This.fHeader.fFormatVersion[0] = bytes.get();
This.fHeader.fFormatVersion[1] = bytes.get();
This.fHeader.fFormatVersion[2] = bytes.get();
@ -203,10 +184,7 @@ final class RBBIDataWrapper {
ICUBinary.skipBytes(bytes, 6 * 4); // uint32_t fReserved[6];
if (This.fHeader.fMagic != 0xb1a0 ||
! (This.fHeader.fVersion == 1 || // ICU 3.2 and earlier
This.fHeader.fFormatVersion[0] == 3) // ICU 3.4
) {
if (This.fHeader.fMagic != 0xb1a0 || !IS_ACCEPTABLE.isDataVersionAcceptable(This.fHeader.fFormatVersion)) {
throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
}
@ -286,7 +264,7 @@ final class RBBIDataWrapper {
// as we don't go more than 100 bytes past the
// end of the TRIE.
This.fTrie = new CharTrie(bytes, fTrieFoldingFunc); // Deserialize the TRIE, leaving buffer
This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer
// at an unknown position, preceding the
// padding between TRIE and following section.
@ -461,7 +439,7 @@ final class RBBIDataWrapper {
out.println("\nCharacter Categories");
out.println("--------------------");
for (char32 = 0; char32<=0x10ffff; char32++) {
category = fTrie.getCodePointValue(char32);
category = fTrie.get(char32);
category &= ~0x4000; // Mask off dictionary bit.
if (category < 0 || category > fHeader.fCatCount) {
out.println("Error, bad category " + Integer.toHexString(category) +

View file

@ -25,17 +25,17 @@ class RBBIRuleBuilder {
// This is the main class for building (compiling) break rules into the tables
// required by the runtime RBBI engine.
//
String fDebugEnv; // controls debug trace output
String fRules; // The rule string that we are compiling
RBBIRuleScanner fScanner; // The scanner.
//
// There are four separate parse trees generated, one for each of the
// forward rules, reverse rules, safe forward rules and safe reverse rules.
// This array references the root of each of the trees.
//
//
RBBINode[] fTreeRoots = new RBBINode[4];
static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
static final int fReverseTree = 1; // for each of the trees.
@ -69,7 +69,7 @@ class RBBIRuleBuilder {
// Map Value is the runtime array index.
List<Integer> fRuleStatusVals; // List of Integer objects. Has same layout as the
// runtime array of status (tag) values -
// runtime array of status (tag) values -
// number of values in group 1
// first status value in group 1
// 2nd status value in group 1
@ -84,50 +84,50 @@ class RBBIRuleBuilder {
//
static final int U_BRK_ERROR_START = 0x10200;
/**< Start of codes indicating Break Iterator failures */
static final int U_BRK_INTERNAL_ERROR = 0x10201;
/**< An internal error (bug) was detected. */
static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
/**< Hex digits expected as part of a escaped char in a rule. */
static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
/**< Missing ';' at the end of a RBBI rule. */
static final int U_BRK_RULE_SYNTAX = 0x10204;
/**< Syntax error in RBBI rule. */
static final int U_BRK_UNCLOSED_SET = 0x10205;
/**< UnicodeSet within an RBBI rule missing a closing ']'. */
static final int U_BRK_ASSIGN_ERROR = 0x10206;
/**< Syntax error in RBBI rule assignment statement. */
static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
/**< RBBI rule $Variable redefined. */
static final int U_BRK_MISMATCHED_PAREN = 0x10208;
/**< Mis-matched parentheses in an RBBI rule. */
static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
/**< Missing closing quote in an RBBI rule. */
static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
/**< Use of an undefined $Variable in an RBBI rule. */
static final int U_BRK_INIT_ERROR = 0x1020b;
/**< Initialization failure. Probable missing ICU Data. */
static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
/**< Rule contains an empty Unicode Set. */
static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
/**< !!option in RBBI rules not recognized. */
static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
/**< The {nnn} tag on a rule is malformed */
static final int U_BRK_MALFORMED_SET = 0x1020f;
static final int U_BRK_ERROR_LIMIT = 0x10210;
/**< This must always be the last value to indicate the limit for Break Iterator failures */
@ -196,7 +196,7 @@ class RBBIRuleBuilder {
//
int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader
header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0;
header[RBBIDataWrapper.DH_FORMATVERSION] = 0x03010000; // uint8_t fFormatVersion[4];
header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable

View file

@ -14,7 +14,8 @@ import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.IntTrieBuilder;
import com.ibm.icu.impl.Trie2Writable;
import com.ibm.icu.impl.Trie2_16;
//
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
@ -49,14 +50,14 @@ class RBBISetBuilder {
RangeDescriptor() {
fIncludesSets = new ArrayList<RBBINode>();
}
RangeDescriptor(RangeDescriptor other) {
fStartChar = other.fStartChar;
fEndChar = other.fEndChar;
fNum = other.fNum;
fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::split()
@ -65,20 +66,20 @@ class RBBISetBuilder {
void split(int where) {
Assert.assrt(where>fStartChar && where<=fEndChar);
RangeDescriptor nr = new RangeDescriptor(this);
// RangeDescriptor copy constructor copies all fields.
// Only need to update those that are different after the split.
nr.fStartChar = where;
this.fEndChar = where-1;
nr.fNext = this.fNext;
this.fNext = nr;
// TODO: fIncludesSets is not updated. Check it out.
// Probably because they haven't been populated yet,
// Probably because they haven't been populated yet,
// but still sloppy.
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::setDictionaryFlag
@ -95,11 +96,11 @@ class RBBISetBuilder {
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
//
// -------------------------------------------------------------------------------------
void setDictionaryFlag() {
int i;
for (i=0; i<this.fIncludesSets.size(); i++) {
RBBINode usetNode = fIncludesSets.get(i);
String setName = "";
@ -119,12 +120,13 @@ class RBBISetBuilder {
}
}
RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing
Trie2Writable fTrie; // The mapping TRIE that is the end result of processing
// the Unicode Sets.
Trie2_16 fFrozenTrie;
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
@ -135,8 +137,8 @@ class RBBISetBuilder {
int fGroupCount;
boolean fSawBOF;
//------------------------------------------------------------------------
//
// RBBISetBuilder Constructor
@ -162,7 +164,7 @@ class RBBISetBuilder {
// Initialize the process by creating a single range encompassing all characters
// that is in no sets.
//
fRangeList = new RangeDescriptor();
fRangeList = new RangeDescriptor();
fRangeList.fStartChar = 0;
fRangeList.fEndChar = 0x10ffff;
@ -245,7 +247,7 @@ class RBBISetBuilder {
}
if (rlRange.fNum == 0) {
fGroupCount ++;
rlRange.fNum = fGroupCount+2;
rlRange.fNum = fGroupCount+2;
rlRange.setDictionaryFlag();
addValToSets(rlRange.fIncludesSets, fGroupCount+2);
}
@ -260,7 +262,7 @@ class RBBISetBuilder {
// subtree for each UnicodeSet that contains the string {eof}
// Because {bof} and {eof} are not characters in the normal sense,
// they don't affect the computation of ranges or TRIE.
String eofString = "eof";
String bofString = "bof";
@ -279,67 +281,26 @@ class RBBISetBuilder {
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
fTrie = new Trie2Writable(0, // Initial value for all code points
0); // Error value.
//IntTrieBuilder(int aliasdata[], int maxdatalength,
// int initialvalue, int leadunitvalue,
// boolean latin1linear)
fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
100000, // Max Data Length
0, // Initial value for all code points
0, // Lead Surrogate unit value,
true); // Keep Latin 1 in separately.
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar, rlRange.fNum, true);
}
}
//-----------------------------------------------------------------------------------
//
// RBBIDataManipulate A little internal class needed only to wrap of the
// getFoldedValue() function needed for Trie table creation.
//
//-----------------------------------------------------------------------------------
class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
    /**
     * Scans the 0x400 code points beginning at {@code start} in fTrie.
     *
     * @param start  first code point to examine
     * @param offset value to return, with bit 15 set, when a non-zero entry is found
     * @return {@code offset | 0x8000} if any examined entry is non-zero, otherwise 0
     */
    public int getFoldedValue(int start, int offset) {
        final boolean[] inBlockZero = new boolean[1];
        final int scanLimit = start + 0x400;

        for (int cp = start; cp < scanLimit; ) {
            final int cpValue = fTrie.getValue(cp, inBlockZero);
            if (inBlockZero[0]) {
                // getValue flagged this position as inBlockZero: step over a whole data block.
                cp += IntTrieBuilder.DATA_BLOCK_LENGTH;
            } else if (cpValue != 0) {
                return offset | 0x08000;
            } else {
                ++cp;
            }
        }
        return 0;
    }
}
RBBIDataManipulate dm = new RBBIDataManipulate();
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int getTrieSize() {
int size = 0;
try {
// The trie serialize function returns the size of the data written.
// null output stream says give size only, don't actually write anything.
size = fTrie.serialize(null, true, dm );
} catch (IOException e) {
Assert.assrt (false);
if (fFrozenTrie == null) {
fFrozenTrie = fTrie.toTrie2_16();
fTrie = null;
}
return size;
return fFrozenTrie.getSerializedLength();
}
@ -349,7 +310,11 @@ class RBBISetBuilder {
//
//-----------------------------------------------------------------------------------
void serializeTrie(OutputStream os) throws IOException {
fTrie.serialize(os, true, dm );
if (fFrozenTrie == null) {
fFrozenTrie = fTrie.toTrie2_16();
fTrie = null;
}
fFrozenTrie.serialize(os);
}
//------------------------------------------------------------------------
@ -416,7 +381,7 @@ class RBBISetBuilder {
//------------------------------------------------------------------------
//
// getFirstChar Given a runtime RBBI character category, find
// the first UChar32 that is in the set of chars
// the first UChar32 that is in the set of chars
// in the category.
//------------------------------------------------------------------------
int getFirstChar(int category) {

View file

@ -24,10 +24,10 @@ import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
@ -495,7 +495,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
int foundBreakCount = 0;
int c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
// Is the character we're starting on a dictionary character? If so, we
// need to back up to include the entire run; otherwise the results of
@ -507,7 +507,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
do {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
} while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
// Back up to the last dictionary character
@ -524,7 +524,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
else {
do {
c = CharacterIteration.previous32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
// Back up to the last dictionary character
@ -538,7 +538,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
rangeStart = fText.getIndex();
}
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
@ -550,14 +550,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (reverse) {
fText.setIndex(rangeStart);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
LanguageBreakEngine lbe = null;
while(true) {
while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
if (current >= rangeEnd) {
break;
@ -577,7 +577,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Reload the loop variables for the next go-round
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
// If we found breaks, build a new break cache. The first and last entries must
@ -1285,7 +1285,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// caches for quicker access
CharacterIterator text = fText;
CharTrie trie = fRData.fTrie;
Trie2 trie = fRData.fTrie;
// Set up the starting char
int c = text.current();
@ -1338,7 +1338,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short) trie.getCodePointValue(c);
category = (short) trie.get(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1483,10 +1483,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
mainLoop: for (;;) {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END || fRData.fHeader.fVersion == 1) {
// Either this is the old (ICU 3.2 and earlier) format data which
// does not support explicit support for matching {eof}, or
// we have already done the {eof} iteration. Now is the time
if (mode == RBBI_END) {
// We have already done the {eof} iteration. Now is the time
// to unconditionally bail out.
if (result == initialPosition) {
// Ran off start, no match found.
@ -1504,7 +1502,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up the current character's category, which tells us
// which column in the state table to look at.
//
category = (short) fRData.fTrie.getCodePointValue(c);
category = (short) fRData.fTrie.get(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d4b1866a85ceb079d912a3283e5ec6a7d6988df8c0e56e98fd67def82c35dcf3
size 12225515
oid sha256:f0d65ed59329e1eaae1813db0fa8e1236a3b58ddfa5e7e1ff33d4bea7eef3c31
size 12226292