mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-13565 Break Iteration, remove the dictionary bit from the implementation.
For identifying text that needs to be handled by a word dictionary for Break Iteration, change from using a bit in the character category to sorting all dictionary categories together, and recording the boundary between the non-dictionary and dictionary ranges. This is internal to the implementation. It does not affect behavior. It does increase the number of character categories that can be handled using a compact 8 bit Trie, from 127 to 255.
This commit is contained in:
parent
85aee40cc3
commit
1eef362329
17 changed files with 326 additions and 307 deletions
|
@ -763,15 +763,15 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
|
||||
if (statetable->fFlags & RBBI_8BITS_ROWS) {
|
||||
if (use8BitsTrie) {
|
||||
return handleNext<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>();
|
||||
return handleNext<RBBIStateTableRow8, TrieFunc8>();
|
||||
} else {
|
||||
return handleNext<RBBIStateTableRow8, TrieFunc16, kDictBit>();
|
||||
return handleNext<RBBIStateTableRow8, TrieFunc16>();
|
||||
}
|
||||
} else {
|
||||
if (use8BitsTrie) {
|
||||
return handleNext<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>();
|
||||
return handleNext<RBBIStateTableRow16, TrieFunc8>();
|
||||
} else {
|
||||
return handleNext<RBBIStateTableRow16, TrieFunc16, kDictBit>();
|
||||
return handleNext<RBBIStateTableRow16, TrieFunc16>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -781,15 +781,15 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
|||
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
|
||||
if (statetable->fFlags & RBBI_8BITS_ROWS) {
|
||||
if (use8BitsTrie) {
|
||||
return handleSafePrevious<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
|
||||
return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
|
||||
} else {
|
||||
return handleSafePrevious<RBBIStateTableRow8, TrieFunc16, kDictBit>(fromPosition);
|
||||
return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
|
||||
}
|
||||
} else {
|
||||
if (use8BitsTrie) {
|
||||
return handleSafePrevious<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
|
||||
return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
|
||||
} else {
|
||||
return handleSafePrevious<RBBIStateTableRow16, TrieFunc16, kDictBit>(fromPosition);
|
||||
return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -801,7 +801,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
|||
// Run the state machine to find a boundary
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
|
||||
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
|
||||
int32_t RuleBasedBreakIterator::handleNext() {
|
||||
int32_t state;
|
||||
uint16_t category = 0;
|
||||
|
@ -815,6 +815,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
const RBBIStateTable *statetable = fData->fForwardTable;
|
||||
const char *tableData = statetable->fTableData;
|
||||
uint32_t tableRowLen = statetable->fRowLen;
|
||||
uint32_t dictStart = statetable->fDictCategoriesStart;
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPuts("Handle Next pos char state category");
|
||||
|
@ -876,17 +877,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
category = trieFunc(fData->fTrie, c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iteration.
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & dictMask) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~dictMask;
|
||||
}
|
||||
fDictionaryCharCount += (category >= dictStart);
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
|
@ -993,7 +984,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
// because the safe table does not require as many options.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
|
||||
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
|
||||
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
|
||||
int32_t state;
|
||||
|
@ -1030,7 +1021,6 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
|||
//
|
||||
// Off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = trieFunc(fData->fTrie, c);
|
||||
category &= ~dictMask;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
|
|
|
@ -119,8 +119,6 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
|
|||
|
||||
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
|
||||
int32_t firstRuleStatus, int32_t otherRuleStatus) {
|
||||
uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ?
|
||||
kDictBitFor8BitsTrie : kDictBit;
|
||||
if ((endPos - startPos) <= 1) {
|
||||
return;
|
||||
}
|
||||
|
@ -145,9 +143,11 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
utext_setNativeIndex(text, rangeStart);
|
||||
UChar32 c = utext_current32(text);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
|
||||
|
||||
while(U_SUCCESS(status)) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) {
|
||||
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
|
||||
&& (category < dictStart)) {
|
||||
utext_next32(text); // TODO: cleaner loop structure.
|
||||
c = utext_current32(text);
|
||||
category = ucptrie_get(fBI->fData->fTrie, c);
|
||||
|
|
|
@ -101,18 +101,18 @@ struct RBBIStateTableRowT {
|
|||
// Value 0: not an accepting state.
|
||||
// 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
|
||||
// >1: Look-ahead match has completed.
|
||||
// Actual boundary position happened earlier
|
||||
// Actual boundary position happened earlier.
|
||||
// Value here == fLookAhead in earlier
|
||||
// state, at actual boundary pos.
|
||||
// state, at actual boundary pos.
|
||||
T fLookAhead; // Non-zero if this row is for a state that
|
||||
// corresponds to a '/' in the rule source.
|
||||
// Value is the same as the fAccepting
|
||||
// value for the rule (which will appear
|
||||
// in a different state.
|
||||
// value for the rule (which will appear
|
||||
// in a different state).
|
||||
T fTagsIdx; // Non-zero if this row covers a {tagged} position
|
||||
// from a rule. Value is the index in the
|
||||
// StatusTable of the set of matching
|
||||
// tags (rule status values)
|
||||
// from a rule. Value is the index in the
|
||||
// StatusTable of the set of matching
|
||||
// tags (rule status values)
|
||||
T fNextState[1]; // Next State, indexed by char category.
|
||||
// Variable-length array declared with length 1
|
||||
// to disable bounds checkers.
|
||||
|
@ -132,14 +132,17 @@ union RBBIStateTableRow {
|
|||
};
|
||||
|
||||
struct RBBIStateTable {
|
||||
uint32_t fNumStates; /* Number of states. */
|
||||
uint32_t fRowLen; /* Length of a state table row, in bytes. */
|
||||
uint32_t fFlags; /* Option Flags for this state table */
|
||||
char fTableData[1]; /* First RBBIStateTableRow begins here. */
|
||||
/* Variable-length array declared with length 1 */
|
||||
/* to disable bounds checkers. */
|
||||
/* (making it char[] simplifies ugly address */
|
||||
/* arithmetic for indexing variable length rows.) */
|
||||
uint32_t fNumStates; // Number of states.
|
||||
uint32_t fRowLen; // Length of a state table row, in bytes.
|
||||
uint32_t fDictCategoriesStart; // Char category number of the first dictionary
|
||||
// char class, or the largest category number + 1
|
||||
// if there are no dictionary categories.
|
||||
uint32_t fFlags; // Option Flags for this state table.
|
||||
char fTableData[1]; // First RBBIStateTableRow begins here.
|
||||
// Variable-length array declared with length 1
|
||||
// to disable bounds checkers.
|
||||
// (making it char[] simplifies ugly address
|
||||
// arithmetic for indexing variable length rows.)
|
||||
};
|
||||
|
||||
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;
|
||||
|
|
|
@ -287,9 +287,7 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
|||
|
||||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
// Munge the Unicode Sets to create an initial set of character categories.
|
||||
//
|
||||
fSetBuilder->buildRanges();
|
||||
|
||||
|
@ -303,6 +301,12 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
|||
}
|
||||
|
||||
fForwardTable->buildForwardTable();
|
||||
|
||||
// State table and character category optimization.
|
||||
// Merge equivalent rows and columns.
|
||||
// Note that this process alters the initial set of character categories,
|
||||
// causing the representation of UnicodeSets in the parse tree to become invalid.
|
||||
|
||||
optimizeTables();
|
||||
fForwardTable->buildSafeReverseTable(status);
|
||||
|
||||
|
@ -315,6 +319,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
|
|||
}
|
||||
#endif
|
||||
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
fSetBuilder->buildTrie();
|
||||
|
||||
//
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
// by the RBBI rules.
|
||||
// - compute a set of non-overlapping character ranges
|
||||
// with all characters within a range belonging to the same
|
||||
// set of input uniocde sets.
|
||||
// set of input unicode sets.
|
||||
// - Derive a set of non-overlapping UnicodeSet (like things)
|
||||
// that will correspond to columns in the state table for
|
||||
// the RBBI execution engine. All characters within one
|
||||
|
@ -45,7 +45,7 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
const int32_t kMaxCharCategoriesFor8BitsTrie = 127;
|
||||
const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor
|
||||
|
@ -55,12 +55,12 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
|
|||
{
|
||||
fRB = rb;
|
||||
fStatus = rb->fStatus;
|
||||
fRangeList = 0;
|
||||
fRangeList = nullptr;
|
||||
fMutableTrie = nullptr;
|
||||
fTrie = nullptr;
|
||||
fTrieSize = 0;
|
||||
fGroupCount = 0;
|
||||
fSawBOF = FALSE;
|
||||
fSawBOF = false;
|
||||
}
|
||||
|
||||
|
||||
|
@ -196,25 +196,48 @@ void RBBISetBuilder::buildRanges() {
|
|||
//
|
||||
// Numbering: # 0 (state table column 0) is unused.
|
||||
// # 1 is reserved - table column 1 is for end-of-input
|
||||
// # 2 is reserved - table column 2 is for beginning-in-input
|
||||
// # 2 is reserved - table column 2 is for beginning-of-input
|
||||
// # 3 is the first range list.
|
||||
//
|
||||
RangeDescriptor *rlSearchRange;
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
int32_t dictGroupCount = 0;
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
|
||||
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
|
||||
if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
|
||||
rlRange->fNum = rlSearchRange->fNum;
|
||||
rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rlRange->fNum == 0) {
|
||||
fGroupCount ++;
|
||||
rlRange->fNum = fGroupCount+2;
|
||||
rlRange->setDictionaryFlag();
|
||||
addValToSets(rlRange->fIncludesSets, fGroupCount+2);
|
||||
rlRange->fFirstInGroup = true;
|
||||
if (rlRange->isDictionaryRange()) {
|
||||
rlRange->fNum = ++dictGroupCount;
|
||||
rlRange->fIncludesDict = true;
|
||||
} else {
|
||||
fGroupCount++;
|
||||
rlRange->fNum = fGroupCount+2;
|
||||
addValToSets(rlRange->fIncludesSets, rlRange->fNum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move the character category numbers for any dictionary ranges up, so that they
|
||||
// immediately follow the non-dictionary ranges.
|
||||
|
||||
fDictCategoriesStart = fGroupCount + 3;
|
||||
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
|
||||
if (rlRange->fIncludesDict) {
|
||||
rlRange->fNum += fDictCategoriesStart - 1;
|
||||
if (rlRange->fFirstInGroup) {
|
||||
addValToSets(rlRange->fIncludesSets, rlRange->fNum);
|
||||
}
|
||||
}
|
||||
}
|
||||
fGroupCount += dictGroupCount;
|
||||
|
||||
|
||||
// Handle input sets that contain the special string {eof}.
|
||||
// Column 1 of the state table is reserved for EOF on input.
|
||||
// Column 2 is reserved for before-the-start-input.
|
||||
|
@ -222,13 +245,11 @@ void RBBISetBuilder::buildRanges() {
|
|||
// references to {bof}.)
|
||||
// Add this column value (1 or 2) to the equivalent expression
|
||||
// subtree for each UnicodeSet that contains the string {eof}
|
||||
// Because {bof} and {eof} are not a characters in the normal sense,
|
||||
// they doesn't affect the computation of ranges or TRIE.
|
||||
static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
|
||||
static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
|
||||
// Because {bof} and {eof} are not characters in the normal sense,
|
||||
// they don't affect the computation of the ranges or TRIE.
|
||||
|
||||
UnicodeString eofString(eofUString);
|
||||
UnicodeString bofString(bofUString);
|
||||
UnicodeString eofString(u"eof");
|
||||
UnicodeString bofString(u"bof");
|
||||
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
|
||||
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
|
||||
if (usetNode==NULL) {
|
||||
|
@ -255,24 +276,16 @@ void RBBISetBuilder::buildRanges() {
|
|||
// range group number.
|
||||
//
|
||||
void RBBISetBuilder::buildTrie() {
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
fMutableTrie = umutablecptrie_open(
|
||||
0, // Initial value for all code points.
|
||||
0, // Error value for out-of-range input.
|
||||
fStatus);
|
||||
|
||||
bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
|
||||
for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
|
||||
uint32_t value = rlRange->fNum;
|
||||
if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) {
|
||||
U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0);
|
||||
value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit);
|
||||
}
|
||||
for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
|
||||
umutablecptrie_setRange(fMutableTrie,
|
||||
rlRange->fStartChar, // Range start
|
||||
rlRange->fEndChar, // Range end (inclusive)
|
||||
value, // value for range
|
||||
range->fStartChar, // Range start
|
||||
range->fEndChar, // Range end (inclusive)
|
||||
range->fNum, // value for range
|
||||
fStatus);
|
||||
}
|
||||
}
|
||||
|
@ -281,16 +294,21 @@ void RBBISetBuilder::buildTrie() {
|
|||
void RBBISetBuilder::mergeCategories(IntPair categories) {
|
||||
U_ASSERT(categories.first >= 1);
|
||||
U_ASSERT(categories.second > categories.first);
|
||||
U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
|
||||
(categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
|
||||
|
||||
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
|
||||
int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit;
|
||||
int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit;
|
||||
int32_t rangeNum = rd->fNum;
|
||||
if (rangeNum == categories.second) {
|
||||
rd->fNum = categories.first | rangeDict;
|
||||
rd->fNum = categories.first;
|
||||
} else if (rangeNum > categories.second) {
|
||||
rd->fNum--;
|
||||
}
|
||||
}
|
||||
--fGroupCount;
|
||||
if (categories.second <= fDictCategoriesStart) {
|
||||
--fDictCategoriesStart;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -395,6 +413,16 @@ int32_t RBBISetBuilder::getNumCharCategories() const {
|
|||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getDictCategoriesStart
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getDictCategoriesStart() const {
|
||||
return fDictCategoriesStart;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// sawBOF
|
||||
|
@ -414,7 +442,7 @@ UBool RBBISetBuilder::sawBOF() const {
|
|||
UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
|
||||
RangeDescriptor *rlRange;
|
||||
UChar32 retVal = (UChar32)-1;
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
|
||||
if (rlRange->fNum == category) {
|
||||
retVal = rlRange->fStartChar;
|
||||
break;
|
||||
|
@ -424,7 +452,6 @@ UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// printRanges A debugging function.
|
||||
|
@ -437,16 +464,16 @@ void RBBISetBuilder::printRanges() {
|
|||
int i;
|
||||
|
||||
RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
|
||||
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
|
||||
RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar);
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
UnicodeString setName = UNICODE_STRING("anon", 4);
|
||||
UnicodeString setName {u"anon"};
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != NULL) {
|
||||
if (setRef != nullptr) {
|
||||
RBBINode *varRef = setRef->fParent;
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
}
|
||||
}
|
||||
|
@ -466,19 +493,15 @@ void RBBISetBuilder::printRanges() {
|
|||
//------------------------------------------------------------------------
|
||||
#ifdef RBBI_DEBUG
|
||||
void RBBISetBuilder::printRangeGroups() {
|
||||
RangeDescriptor *rlRange;
|
||||
RangeDescriptor *tRange;
|
||||
int i;
|
||||
int lastPrintedGroupNum = 0;
|
||||
|
||||
RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
int groupNum = rlRange->fNum & 0xbfff;
|
||||
if (groupNum > lastPrintedGroupNum) {
|
||||
lastPrintedGroupNum = groupNum;
|
||||
for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
|
||||
if (rlRange->fFirstInGroup) {
|
||||
int groupNum = rlRange->fNum;
|
||||
RBBIDebugPrintf("%2i ", groupNum);
|
||||
|
||||
if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" <DICT> ");}
|
||||
if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
|
@ -494,7 +517,7 @@ void RBBISetBuilder::printRangeGroups() {
|
|||
}
|
||||
|
||||
i = 0;
|
||||
for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
|
||||
for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
|
||||
if (tRange->fNum == rlRange->fNum) {
|
||||
if (i++ % 5 == 0) {
|
||||
RBBIDebugPrintf("\n ");
|
||||
|
@ -561,28 +584,22 @@ void RBBISetBuilder::printSets() {
|
|||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
||||
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
|
||||
int i;
|
||||
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
|
||||
fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum},
|
||||
fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
|
||||
|
||||
this->fStartChar = other.fStartChar;
|
||||
this->fEndChar = other.fEndChar;
|
||||
this->fNum = other.fNum;
|
||||
this->fNext = NULL;
|
||||
UErrorCode oldstatus = status;
|
||||
this->fIncludesSets = new UVector(status);
|
||||
if (U_FAILURE(oldstatus)) {
|
||||
status = oldstatus;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fIncludesSets = new UVector(status);
|
||||
if (this->fIncludesSets == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
/* test for NULL */
|
||||
if (this->fIncludesSets == 0) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
for (i=0; i<other.fIncludesSets->size(); i++) {
|
||||
for (int32_t i=0; i<other.fIncludesSets->size(); i++) {
|
||||
this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
|
||||
}
|
||||
}
|
||||
|
@ -594,24 +611,13 @@ RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &statu
|
|||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
RangeDescriptor::RangeDescriptor(UErrorCode &status) {
|
||||
this->fStartChar = 0;
|
||||
this->fEndChar = 0;
|
||||
this->fNum = 0;
|
||||
this->fNext = NULL;
|
||||
UErrorCode oldstatus = status;
|
||||
this->fIncludesSets = new UVector(status);
|
||||
if (U_FAILURE(oldstatus)) {
|
||||
status = oldstatus;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
/* test for NULL */
|
||||
if(this->fIncludesSets == 0) {
|
||||
fIncludesSets = new UVector(status);
|
||||
if (fIncludesSets == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -622,7 +628,7 @@ RangeDescriptor::RangeDescriptor(UErrorCode &status) {
|
|||
//-------------------------------------------------------------------------------------
|
||||
RangeDescriptor::~RangeDescriptor() {
|
||||
delete fIncludesSets;
|
||||
fIncludesSets = NULL;
|
||||
fIncludesSets = nullptr;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
@ -633,7 +639,7 @@ RangeDescriptor::~RangeDescriptor() {
|
|||
void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
|
||||
U_ASSERT(where>fStartChar && where<=fEndChar);
|
||||
RangeDescriptor *nr = new RangeDescriptor(*this, status);
|
||||
if(nr == 0) {
|
||||
if(nr == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
@ -652,27 +658,22 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
|
|||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDescriptor::setDictionaryFlag
|
||||
// RangeDescriptor::isDictionaryRange
|
||||
//
|
||||
// Character Category Numbers that include characters from
|
||||
// the original Unicode Set named "dictionary" have bit 14
|
||||
// set to 1. The RBBI runtime engine uses this to trigger
|
||||
// use of the word dictionary.
|
||||
// Test whether this range includes characters from
|
||||
// the original Unicode Set named "dictionary".
|
||||
//
|
||||
// This function looks through the Unicode Sets that it
|
||||
// (the range) includes, and sets the bit in fNum when
|
||||
// "dictionary" is among them.
|
||||
// This function looks through the Unicode Sets that
|
||||
// the range includes, checking for one named "dictionary"
|
||||
//
|
||||
// TODO: a faster way would be to find the set node for
|
||||
// "dictionary" just once, rather than looking it
|
||||
// up by name every time.
|
||||
//
|
||||
//-------------------------------------------------------------------------------------
|
||||
void RangeDescriptor::setDictionaryFlag() {
|
||||
int i;
|
||||
|
||||
bool RangeDescriptor::isDictionaryRange() {
|
||||
static const char16_t *dictionary = u"dictionary";
|
||||
for (i=0; i<fIncludesSets->size(); i++) {
|
||||
for (int32_t i=0; i<fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != nullptr) {
|
||||
|
@ -680,16 +681,14 @@ void RangeDescriptor::setDictionaryFlag() {
|
|||
if (varRef && varRef->fType == RBBINode::varRef) {
|
||||
const UnicodeString *setName = &varRef->fText;
|
||||
if (setName->compare(dictionary, -1) == 0) {
|
||||
fNum |= RuleBasedBreakIterator::kDictBit;
|
||||
break;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -41,25 +41,26 @@ U_NAMESPACE_BEGIN
|
|||
//
|
||||
class RangeDescriptor : public UMemory {
|
||||
public:
|
||||
UChar32 fStartChar; // Start of range, unicode 32 bit value.
|
||||
UChar32 fEndChar; // End of range, unicode 32 bit value.
|
||||
int32_t fNum; // runtime-mapped input value for this range.
|
||||
UVector *fIncludesSets; // vector of the the original
|
||||
// Unicode sets that include this range.
|
||||
// (Contains ptrs to uset nodes)
|
||||
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
|
||||
UChar32 fStartChar {}; // Start of range, unicode 32 bit value.
|
||||
UChar32 fEndChar {}; // End of range, unicode 32 bit value.
|
||||
int32_t fNum {0}; // runtime-mapped input value for this range.
|
||||
bool fIncludesDict {false}; // True if the range includes $dictionary.
|
||||
bool fFirstInGroup {false}; // True if first range in a group with the same fNum.
|
||||
UVector *fIncludesSets {nullptr}; // vector of the the original
|
||||
// Unicode sets that include this range.
|
||||
// (Contains ptrs to uset nodes)
|
||||
RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list.
|
||||
|
||||
RangeDescriptor(UErrorCode &status);
|
||||
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
|
||||
~RangeDescriptor();
|
||||
void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
|
||||
// where appearing in the second (higher) part.
|
||||
void setDictionaryFlag(); // Check whether this range appears as part of
|
||||
bool isDictionaryRange(); // Check whether this range appears as part of
|
||||
// the Unicode set named "dictionary"
|
||||
|
||||
private:
|
||||
RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
|
||||
RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
|
||||
RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
|
||||
RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
|
||||
};
|
||||
|
||||
|
||||
|
@ -90,6 +91,8 @@ public:
|
|||
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
|
||||
// runtime state machine, which are the same as
|
||||
// columns in the DFA state table
|
||||
int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or
|
||||
// last category + 1 if there are no dictionary categories.
|
||||
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
|
||||
void serializeTrie(uint8_t *where); // write out the serialized Trie.
|
||||
UChar32 getFirstChar(int32_t val) const;
|
||||
|
@ -113,8 +116,6 @@ public:
|
|||
#endif
|
||||
|
||||
private:
|
||||
void numberSets();
|
||||
|
||||
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
|
||||
UErrorCode *fStatus;
|
||||
|
||||
|
@ -124,14 +125,13 @@ private:
|
|||
UCPTrie *fTrie; // the Unicode Sets.
|
||||
uint32_t fTrieSize;
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
// fGroupCount is the index of the last used group.
|
||||
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
|
||||
// State table column 0 is not used. Column 1 is for end-of-input.
|
||||
// column 2 is for group 0. Funny counting.
|
||||
// Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
|
||||
int32_t fGroupCount;
|
||||
|
||||
// The number of the first dictionary char category.
|
||||
// If there are no Dictionary categories, set to the last category + 1.
|
||||
int32_t fDictCategoriesStart;
|
||||
|
||||
UBool fSawBOF;
|
||||
|
||||
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
|
||||
|
|
|
@ -1155,7 +1155,13 @@ bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
|
|||
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
for (; categories->first < numCols-1; categories->first++) {
|
||||
for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
|
||||
// Note: dictionary & non-dictionary columns cannot be merged.
|
||||
// The limitSecond value prevents considering mixed pairs.
|
||||
// Dictionary categories are >= DictCategoriesStart.
|
||||
// Non dict categories are < DictCategoriesStart.
|
||||
int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
|
||||
fRB->fSetBuilder->getDictCategoriesStart() : numCols;
|
||||
for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
|
||||
// Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
|
||||
uint16_t table_base = 0;
|
||||
uint16_t table_dupl = 1;
|
||||
|
@ -1379,6 +1385,7 @@ void RBBITableBuilder::exportTable(void *where) {
|
|||
}
|
||||
|
||||
table->fNumStates = fDStates->size();
|
||||
table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
|
||||
table->fFlags = 0;
|
||||
if (use8BitsForTable()) {
|
||||
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
|
||||
|
@ -1652,12 +1659,12 @@ void RBBITableBuilder::printStates() {
|
|||
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
|
||||
RBBIDebugPrintf(" | Acc LA Tag");
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
RBBIDebugPrintf(" %2d", c);
|
||||
RBBIDebugPrintf(" %3d", c);
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
RBBIDebugPrintf(" |---------------");
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
RBBIDebugPrintf("---");
|
||||
RBBIDebugPrintf("----");
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
|
||||
|
@ -1666,7 +1673,7 @@ void RBBITableBuilder::printStates() {
|
|||
RBBIDebugPrintf(" %3d | " , n);
|
||||
RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
|
||||
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
|
||||
RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
|
||||
RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
|
||||
}
|
||||
RBBIDebugPrintf("\n");
|
||||
}
|
||||
|
|
|
@ -677,10 +677,10 @@ private:
|
|||
|
||||
typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
|
||||
|
||||
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
|
||||
template<typename RowType, PTrieFunc trieFunc>
|
||||
int32_t handleSafePrevious(int32_t fromPosition);
|
||||
|
||||
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
|
||||
template<typename RowType, PTrieFunc trieFunc>
|
||||
int32_t handleNext();
|
||||
|
||||
|
||||
|
@ -705,17 +705,6 @@ private:
|
|||
* @internal
|
||||
*/
|
||||
void dumpTables();
|
||||
|
||||
/**
|
||||
* Bit for dictionary based category
|
||||
*/
|
||||
static constexpr int32_t kDictBit = 0x4000;
|
||||
|
||||
/**
|
||||
* Bit for dictionary based category in 8bits trie
|
||||
*/
|
||||
static constexpr int32_t kDictBitFor8BitsTrie = 0x0080;
|
||||
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
};
|
||||
|
||||
|
|
|
@ -4657,7 +4657,8 @@ void RBBITest::TestTableRedundancies() {
|
|||
}
|
||||
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
|
||||
for (int c1=1; c1<numCharClasses; c1++) {
|
||||
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
|
||||
int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
|
||||
for (int c2 = c1+1; c2 < limit; c2++) {
|
||||
if (columns.at(c1) == columns.at(c2)) {
|
||||
errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
|
||||
goto out;
|
||||
|
@ -4952,15 +4953,15 @@ void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits
|
|||
}
|
||||
|
||||
void RBBITest::Test8BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
|
||||
testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
void RBBITest::Test16BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
|
||||
testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
void RBBITest::Test16BitsTrieWith16BitStateTable() {
|
||||
testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
|
||||
testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
void RBBITest::Test8BitsTrieWith16BitStateTable() {
|
||||
|
|
|
@ -41,10 +41,20 @@ public final class RBBIDataWrapper {
|
|||
* Length of a table row in bytes. Note mismatch with table data, which is short[].
|
||||
*/
|
||||
public int fRowLen;
|
||||
/**
|
||||
* Char category number of the first dictionary char class,
|
||||
* or the largest category number + 1 if there are no dictionary categories.
|
||||
*/
|
||||
public int fDictCategoriesStart;
|
||||
/**
|
||||
* Option Flags for this state table.
|
||||
*/
|
||||
public int fFlags;
|
||||
/**
|
||||
* Length in bytes of the state table header, of all the int32 fields
|
||||
* preceding fTable in the serialized form.
|
||||
*/
|
||||
public static int fHeaderSize = 16;
|
||||
/**
|
||||
* Linear array of next state values, accessed as short[state, char_class]
|
||||
*/
|
||||
|
@ -57,14 +67,15 @@ public final class RBBIDataWrapper {
|
|||
if (length == 0) {
|
||||
return null;
|
||||
}
|
||||
if (length < 12) {
|
||||
if (length < fHeaderSize) {
|
||||
throw new IOException("Invalid RBBI state table length.");
|
||||
}
|
||||
RBBIStateTable This = new RBBIStateTable();
|
||||
This.fNumStates = bytes.getInt();
|
||||
This.fRowLen = bytes.getInt();
|
||||
This.fDictCategoriesStart = bytes.getInt();
|
||||
This.fFlags = bytes.getInt();
|
||||
int lengthOfTable = length - 12; // length in bytes.
|
||||
int lengthOfTable = length - fHeaderSize; // length in bytes.
|
||||
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||||
if (use8Bits) {
|
||||
This.fTable = new char[lengthOfTable];
|
||||
|
@ -82,6 +93,7 @@ public final class RBBIDataWrapper {
|
|||
public int put(DataOutputStream bytes) throws IOException {
|
||||
bytes.writeInt(fNumStates);
|
||||
bytes.writeInt(fRowLen);
|
||||
bytes.writeInt(fDictCategoriesStart);
|
||||
bytes.writeInt(fFlags);
|
||||
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
|
||||
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
|
||||
|
@ -95,8 +107,8 @@ public final class RBBIDataWrapper {
|
|||
bytes.writeChar(fTable[i]);
|
||||
}
|
||||
}
|
||||
int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written,
|
||||
// including 12 for the header.
|
||||
int bytesWritten = fHeaderSize + fRowLen * fNumStates; // total bytes written,
|
||||
// including the header.
|
||||
while (bytesWritten % 8 != 0) {
|
||||
bytes.writeByte(0);
|
||||
++bytesWritten;
|
||||
|
@ -118,6 +130,7 @@ public final class RBBIDataWrapper {
|
|||
RBBIStateTable otherST = (RBBIStateTable)other;
|
||||
if (fNumStates != otherST.fNumStates) return false;
|
||||
if (fRowLen != otherST.fRowLen) return false;
|
||||
if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false;
|
||||
if (fFlags != otherST.fFlags) return false;
|
||||
return Arrays.equals(fTable, otherST.fTable);
|
||||
}
|
||||
|
@ -216,9 +229,6 @@ public final class RBBIDataWrapper {
|
|||
public final static int RBBI_BOF_REQUIRED = 2;
|
||||
public final static int RBBI_8BITS_ROWS = 4;
|
||||
|
||||
public final static int DICT_BIT = 0x4000;
|
||||
public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
|
||||
|
||||
/**
|
||||
* Data Header. A struct-like class with the fields from the RBBI data file header.
|
||||
* Not intended for public use, declared public for testing purposes only.
|
||||
|
@ -496,7 +506,6 @@ public final class RBBIDataWrapper {
|
|||
int char32;
|
||||
int category;
|
||||
int lastNewline[] = new int[n+1];
|
||||
int dictMask = fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT;
|
||||
|
||||
for (category = 0; category <= fHeader.fCatCount; category ++) {
|
||||
catStrings[category] = "";
|
||||
|
@ -505,7 +514,6 @@ public final class RBBIDataWrapper {
|
|||
out.println("--------------------");
|
||||
for (char32 = 0; char32<=0x10ffff; char32++) {
|
||||
category = fTrie.get(char32);
|
||||
category &= ~dictMask; // Mask off dictionary bit.
|
||||
if (category < 0 || category > fHeader.fCatCount) {
|
||||
out.println("Error, bad category " + Integer.toHexString(category) +
|
||||
" for char " + Integer.toHexString(char32));
|
||||
|
|
|
@ -67,7 +67,7 @@ class RBBIRuleBuilder {
|
|||
//
|
||||
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
|
||||
//
|
||||
Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
|
||||
Map<Set<Integer>, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far.
|
||||
// Map Key is the set of values.
|
||||
// Map Value is the runtime array index.
|
||||
|
||||
|
@ -146,8 +146,8 @@ class RBBIRuleBuilder {
|
|||
ICUDebug.value("rbbi") : null;
|
||||
fRules = rules;
|
||||
fStrippedRules = new StringBuilder(rules);
|
||||
fUSetNodes = new ArrayList<RBBINode>();
|
||||
fRuleStatusVals = new ArrayList<Integer>();
|
||||
fUSetNodes = new ArrayList<>();
|
||||
fRuleStatusVals = new ArrayList<>();
|
||||
fScanner = new RBBIRuleScanner(this);
|
||||
fSetBuilder = new RBBISetBuilder(this);
|
||||
}
|
||||
|
@ -294,9 +294,7 @@ class RBBIRuleBuilder {
|
|||
|
||||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
// Munge the Unicode Sets to create an initial set of character categories.
|
||||
//
|
||||
fSetBuilder.buildRanges();
|
||||
|
||||
|
@ -305,6 +303,10 @@ class RBBIRuleBuilder {
|
|||
//
|
||||
fForwardTable = new RBBITableBuilder(this, fForwardTree);
|
||||
fForwardTable.buildForwardTable();
|
||||
// State table and character category optimization.
|
||||
// Merge equivalent rows and columns.
|
||||
// Note that this process alters the initial set of character categories,
|
||||
// causing the representation of UnicodeSets in the parse tree to become invalid.
|
||||
optimizeTables();
|
||||
fForwardTable.buildSafeReverseTable();
|
||||
|
||||
|
@ -315,7 +317,9 @@ class RBBIRuleBuilder {
|
|||
fForwardTable.printRuleStatusTable();
|
||||
fForwardTable.printReverseTable();
|
||||
}
|
||||
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
fSetBuilder.buildTrie();
|
||||
//
|
||||
// Package up the compiled data, writing it to an output stream
|
||||
|
|
|
@ -29,7 +29,7 @@ import com.ibm.icu.util.MutableCodePointTrie;
|
|||
// by the RBBI rules.
|
||||
// - compute a set of non-overlapping character ranges
|
||||
// with all characters within a range belonging to the same
|
||||
// set of input uniocde sets.
|
||||
// set of input unicode sets.
|
||||
// - Derive a set of non-overlapping UnicodeSet (like things)
|
||||
// that will correspond to columns in the state table for
|
||||
// the RBBI execution engine. All characters within one
|
||||
|
@ -41,23 +41,27 @@ import com.ibm.icu.util.MutableCodePointTrie;
|
|||
//
|
||||
class RBBISetBuilder {
|
||||
static class RangeDescriptor {
|
||||
int fStartChar; // Start of range, unicode 32 bit value.
|
||||
int fEndChar; // End of range, unicode 32 bit value.
|
||||
int fNum; // runtime-mapped input value for this range.
|
||||
List<RBBINode> fIncludesSets; // vector of the the original
|
||||
// Unicode sets that include this range.
|
||||
// (Contains ptrs to uset nodes)
|
||||
RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
|
||||
int fStartChar = 0; // Start of range, unicode 32 bit value.
|
||||
int fEndChar = 0; // End of range, unicode 32 bit value.
|
||||
int fNum = 0; // runtime-mapped input value for this range.
|
||||
boolean fIncludesDict = false; // True if the range includes $dictionary.
|
||||
boolean fFirstInGroup = false; // True if first range in a group with the same fNum.
|
||||
List<RBBINode> fIncludesSets; // vector of the original
|
||||
// Unicode sets that include this range.
|
||||
// (Contains ptrs to uset nodes)
|
||||
RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
|
||||
|
||||
RangeDescriptor() {
|
||||
fIncludesSets = new ArrayList<RBBINode>();
|
||||
fIncludesSets = new ArrayList<>();
|
||||
}
|
||||
|
||||
RangeDescriptor(RangeDescriptor other) {
|
||||
fStartChar = other.fStartChar;
|
||||
fEndChar = other.fEndChar;
|
||||
fNum = other.fNum;
|
||||
fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
|
||||
fIncludesDict = other.fIncludesDict;
|
||||
fFirstInGroup = other.fFirstInGroup;
|
||||
fIncludesSets = new ArrayList<>(other.fIncludesSets);
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
|
@ -82,28 +86,18 @@ class RBBISetBuilder {
|
|||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDescriptor::setDictionaryFlag
|
||||
//
|
||||
// Character Category Numbers that include characters from
|
||||
// the original Unicode Set named "dictionary" have bit 14
|
||||
// set to 1. The RBBI runtime engine uses this to trigger
|
||||
// use of the word dictionary.
|
||||
//
|
||||
// This function looks through the Unicode Sets that it
|
||||
// (the range) includes, and sets the bit in fNum when
|
||||
// "dictionary" is among them.
|
||||
//
|
||||
/**
|
||||
* Test whether this range includes characters from the original Unicode Set named "dictionary".
|
||||
*
|
||||
* This function looks through the Unicode Sets that
|
||||
* the range includes, checking for one named "dictionary"
|
||||
*/
|
||||
// TODO: a faster way would be to find the set node for
|
||||
// "dictionary" just once, rather than looking it
|
||||
// up by name every time.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
void setDictionaryFlag() {
|
||||
int i;
|
||||
|
||||
for (i=0; i<this.fIncludesSets.size(); i++) {
|
||||
boolean isDictionaryRange() {
|
||||
for (int i=0; i<this.fIncludesSets.size(); i++) {
|
||||
RBBINode usetNode = fIncludesSets.get(i);
|
||||
String setName = "";
|
||||
RBBINode setRef = usetNode.fParent;
|
||||
|
@ -114,11 +108,10 @@ class RBBISetBuilder {
|
|||
}
|
||||
}
|
||||
if (setName.equals("dictionary")) {
|
||||
this.fNum |= DICT_BIT;
|
||||
break;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -130,19 +123,18 @@ class RBBISetBuilder {
|
|||
// the Unicode Sets.
|
||||
CodePointTrie fFrozenTrie;
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
// fGroupCount is the index of the last used group.
|
||||
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
|
||||
// State table column 0 is not used. Column 1 is for end-of-input.
|
||||
// column 2 is for group 0. Funny counting.
|
||||
/**
|
||||
* Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
|
||||
*/
|
||||
int fGroupCount;
|
||||
/**
|
||||
* The number of the first dictionary char category.
|
||||
* If there are no Dictionary categories, set to the last category + 1.
|
||||
*/
|
||||
int fDictCategoriesStart;
|
||||
|
||||
boolean fSawBOF;
|
||||
|
||||
static final int DICT_BIT = 0x4000;
|
||||
static final int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -239,25 +231,49 @@ class RBBISetBuilder {
|
|||
//
|
||||
// Numbering: # 0 (state table column 0) is unused.
|
||||
// # 1 is reserved - table column 1 is for end-of-input
|
||||
// # 2 is reserved - table column 2 is for beginning-in-input
|
||||
// # 2 is reserved - table column 2 is for beginning-of-input
|
||||
// # 3 is the first range list.
|
||||
//
|
||||
RangeDescriptor rlSearchRange;
|
||||
int dictGroupCount = 0;
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
|
||||
if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
|
||||
rlRange.fNum = rlSearchRange.fNum;
|
||||
rlRange.fIncludesDict = rlSearchRange.fIncludesDict;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rlRange.fNum == 0) {
|
||||
fGroupCount ++;
|
||||
rlRange.fNum = fGroupCount+2;
|
||||
rlRange.setDictionaryFlag();
|
||||
addValToSets(rlRange.fIncludesSets, fGroupCount+2);
|
||||
rlRange.fFirstInGroup = true;
|
||||
if (rlRange.isDictionaryRange()) {
|
||||
rlRange.fNum = ++dictGroupCount;
|
||||
rlRange.fIncludesDict = true;
|
||||
} else {
|
||||
fGroupCount++;
|
||||
rlRange.fNum = fGroupCount + 2;
|
||||
addValToSets(rlRange.fIncludesSets, fGroupCount + 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Move the character category numbers for any dictionary ranges up, so that they
|
||||
// immediately follow the non-dictionary ranges.
|
||||
|
||||
fDictCategoriesStart = fGroupCount + 3;
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
if (rlRange.fIncludesDict) {
|
||||
rlRange.fNum += fDictCategoriesStart - 1;
|
||||
if (rlRange.fFirstInGroup) {
|
||||
addValToSets(rlRange.fIncludesSets, rlRange.fNum);
|
||||
}
|
||||
}
|
||||
}
|
||||
fGroupCount += dictGroupCount;
|
||||
|
||||
|
||||
|
||||
// Handle input sets that contain the special string {eof}.
|
||||
// Column 1 of the state table is reserved for EOF on input.
|
||||
// Column 2 is reserved for before-the-start-input.
|
||||
|
@ -288,31 +304,21 @@ class RBBISetBuilder {
|
|||
}
|
||||
|
||||
|
||||
private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127;
|
||||
private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255;
|
||||
|
||||
/**
|
||||
* Build the Trie table for mapping UChar32 values to the corresponding
|
||||
* range group number.
|
||||
*/
|
||||
void buildTrie() {
|
||||
boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
|
||||
RangeDescriptor rlRange;
|
||||
|
||||
fTrie = new MutableCodePointTrie(0, // Initial value for all code points.
|
||||
0); // Error value for out-of-range input.
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
int value = rlRange.fNum;
|
||||
if (use8Bits && ((value & DICT_BIT) != 0)) {
|
||||
assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0);
|
||||
// switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE
|
||||
value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT);
|
||||
}
|
||||
fTrie.setRange(
|
||||
rlRange.fStartChar, // Range start
|
||||
rlRange.fEndChar, // Range end (inclusive)
|
||||
value // value for range
|
||||
);
|
||||
for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
fTrie.setRange(rlRange.fStartChar, // Range start
|
||||
rlRange.fEndChar, // Range end (inclusive)
|
||||
rlRange.fNum // value for range
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -324,16 +330,20 @@ class RBBISetBuilder {
|
|||
void mergeCategories(IntPair categories) {
|
||||
assert(categories.first >= 1);
|
||||
assert(categories.second > categories.first);
|
||||
assert((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
|
||||
(categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
|
||||
for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) {
|
||||
int rangeNum = rd.fNum & ~DICT_BIT;
|
||||
int rangeDict = rd.fNum & DICT_BIT;
|
||||
int rangeNum = rd.fNum;
|
||||
if (rangeNum == categories.second) {
|
||||
rd.fNum = categories.first | rangeDict;
|
||||
rd.fNum = categories.first;
|
||||
} else if (rangeNum > categories.second) {
|
||||
rd.fNum--;
|
||||
}
|
||||
}
|
||||
--fGroupCount;
|
||||
if (categories.second <= fDictCategoriesStart) {
|
||||
--fDictCategoriesStart;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
|
@ -425,6 +435,16 @@ class RBBISetBuilder {
|
|||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getDictCategoriesStart
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
int getDictCategoriesStart() {
|
||||
return fDictCategoriesStart;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// sawBOF
|
||||
|
@ -454,7 +474,6 @@ class RBBISetBuilder {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// printRanges A debugging function.
|
||||
|
@ -468,7 +487,7 @@ class RBBISetBuilder {
|
|||
|
||||
System.out.print("\n\n Nonoverlapping Ranges ...\n");
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar);
|
||||
System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar);
|
||||
|
||||
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
|
||||
RBBINode usetNode = rlRange.fIncludesSets.get(i);
|
||||
|
@ -496,20 +515,16 @@ class RBBISetBuilder {
|
|||
//------------------------------------------------------------------------
|
||||
///CLOVER:OFF
|
||||
void printRangeGroups() {
|
||||
RangeDescriptor rlRange;
|
||||
RangeDescriptor tRange;
|
||||
int i;
|
||||
int lastPrintedGroupNum = 0;
|
||||
|
||||
System.out.print("\nRanges grouped by Unicode Set Membership...\n");
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
int groupNum = rlRange.fNum & 0xbfff;
|
||||
if (groupNum > lastPrintedGroupNum) {
|
||||
lastPrintedGroupNum = groupNum;
|
||||
for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
if (rlRange.fFirstInGroup) {
|
||||
int groupNum = rlRange.fNum;
|
||||
if (groupNum<10) {System.out.print(" ");}
|
||||
System.out.print(groupNum + " ");
|
||||
|
||||
if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");}
|
||||
if (groupNum >= fDictCategoriesStart) { System.out.print(" <DICT> ");}
|
||||
|
||||
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
|
||||
RBBINode usetNode = rlRange.fIncludesSets.get(i);
|
||||
|
@ -525,7 +540,7 @@ class RBBISetBuilder {
|
|||
}
|
||||
|
||||
i = 0;
|
||||
for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
|
||||
for (RangeDescriptor tRange = rlRange; tRange != null; tRange = tRange.fNext) {
|
||||
if (tRange.fNum == rlRange.fNum) {
|
||||
if (i++ % 5 == 0) {
|
||||
System.out.print("\n ");
|
||||
|
|
|
@ -905,7 +905,13 @@ class RBBITableBuilder {
|
|||
int table_base = 0;
|
||||
int table_dupl = 0;
|
||||
for (; categories.first < numCols-1; ++categories.first) {
|
||||
for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
|
||||
// Note: dictionary & non-dictionary columns cannot be merged.
|
||||
// The limitSecond value prevents considering mixed pairs.
|
||||
// Dictionary categories are >= DictCategoriesStart.
|
||||
// Non dict categories are < DictCategoriesStart.
|
||||
int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ?
|
||||
fRB.fSetBuilder.getDictCategoriesStart() : numCols;
|
||||
for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) {
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
table_base = sd.fDtran[categories.first];
|
||||
|
@ -1103,7 +1109,7 @@ class RBBITableBuilder {
|
|||
if (fRB.fTreeRoots[fRootIx] == null) {
|
||||
return 0;
|
||||
}
|
||||
int size = 12; // The header of 4 ints, with no rows to the table.
|
||||
int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
|
||||
int numRows = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
|
@ -1132,17 +1138,18 @@ class RBBITableBuilder {
|
|||
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
|
||||
fDStates.size() < 0x7fff);
|
||||
table.fNumStates = fDStates.size();
|
||||
table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
|
||||
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
|
||||
// Size of table size in shorts.
|
||||
int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
|
||||
int tableSize;
|
||||
if (use8Bits) {
|
||||
tableSize = (getTableSize() - 12); // fTable length in bytes.
|
||||
tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
|
||||
table.fTable = new char[tableSize];
|
||||
table.fRowLen = rowLen; // Row length in bytes.
|
||||
} else {
|
||||
tableSize = (getTableSize() - 12) / 2; // fTable length in shorts.
|
||||
tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize) / 2; // fTable length in shorts.
|
||||
table.fTable = new char[tableSize];
|
||||
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||||
}
|
||||
|
@ -1275,7 +1282,7 @@ class RBBITableBuilder {
|
|||
if (fSafeTable == null) {
|
||||
return 0;
|
||||
}
|
||||
int size = 12; // The header of 4 ints, with no rows to the table.
|
||||
int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
|
||||
int numRows = fSafeTable.size();
|
||||
int numCols = fSafeTable.get(0).length;
|
||||
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
|
||||
|
@ -1303,7 +1310,7 @@ class RBBITableBuilder {
|
|||
int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
|
||||
// TODO: tableSize is basically numStates * numCharCategories,
|
||||
// except for alignment padding. Clean up here, and in main exportTable().
|
||||
int tableSize = (getSafeTableSize() - 12); // fTable length in bytes.
|
||||
int tableSize = (getSafeTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
|
||||
if (use8Bits) {
|
||||
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
|
||||
table.fTable = new char[tableSize];
|
||||
|
@ -1357,12 +1364,12 @@ class RBBITableBuilder {
|
|||
System.out.print("state | i n p u t s y m b o l s \n");
|
||||
System.out.print(" | Acc LA Tag");
|
||||
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
|
||||
RBBINode.printInt(c, 3);
|
||||
RBBINode.printInt(c, 4);
|
||||
}
|
||||
System.out.print("\n");
|
||||
System.out.print(" |---------------");
|
||||
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
|
||||
System.out.print("---");
|
||||
System.out.print("----");
|
||||
}
|
||||
System.out.print("\n");
|
||||
|
||||
|
@ -1376,7 +1383,7 @@ class RBBITableBuilder {
|
|||
RBBINode.printInt(sd.fTagsIdx, 6);
|
||||
System.out.print(" ");
|
||||
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
|
||||
RBBINode.printInt(sd.fDtran[c], 3);
|
||||
RBBINode.printInt(sd.fDtran[c], 4);
|
||||
}
|
||||
System.out.print("\n");
|
||||
}
|
||||
|
|
|
@ -843,9 +843,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
int row = fRData.getRowIndex(state);
|
||||
short category = 3;
|
||||
int flagsState = fRData.fFTable.fFlags;
|
||||
int dictStart = fRData.fFTable.fDictCategoriesStart;
|
||||
int mode = RBBI_RUN;
|
||||
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
|
||||
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
|
||||
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
|
||||
category = 2;
|
||||
mode = RBBI_START;
|
||||
|
@ -882,15 +881,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
//
|
||||
category = (short) trie.get(c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & dictMask) != 0) {
|
||||
// Check for categories that require word dictionary handling.
|
||||
if (category >= dictStart) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~dictMask;
|
||||
}
|
||||
|
||||
if (TRACE) {
|
||||
|
@ -1004,9 +997,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
CharacterIterator text = fText;
|
||||
CodePointTrie trie = fRData.fTrie;
|
||||
char[] stateTable = fRData.fRTable.fTable;
|
||||
int flagsState = fRData.fRTable.fFlags;
|
||||
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
|
||||
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
|
||||
|
||||
CISetIndex32(text, fromPosition);
|
||||
if (TRACE) {
|
||||
|
@ -1032,7 +1022,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = (short) trie.get(c);
|
||||
category &= ~dictMask;
|
||||
if (TRACE) {
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||||
|
@ -1212,8 +1201,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
int category;
|
||||
int current;
|
||||
int foundBreakCount = 0;
|
||||
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
|
||||
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
|
||||
|
||||
// Loop through the text, looking for ranges of dictionary characters.
|
||||
// For each span, find the appropriate break engine, and ask it to find
|
||||
|
@ -1222,9 +1209,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
fText.setIndex(rangeStart);
|
||||
int c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
int dictStart = fRData.fFTable.fDictCategoriesStart;
|
||||
|
||||
while(true) {
|
||||
while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) {
|
||||
while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
|
||||
c = CharacterIteration.next32(fText); // pre-increment
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799
|
||||
size 13141762
|
||||
oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c
|
||||
size 13142859
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9
|
||||
oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7
|
||||
size 94529
|
||||
|
|
|
@ -408,7 +408,7 @@ public class RBBITest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
List<Thread> threads = new ArrayList<Thread>();
|
||||
List<Thread> threads = new ArrayList<>();
|
||||
for (int n = 0; n<4; ++n) {
|
||||
threads.add(new Thread(new WorkerThread()));
|
||||
}
|
||||
|
@ -513,7 +513,7 @@ public class RBBITest extends TestFmwk {
|
|||
}
|
||||
private static final BreakIterator BREAK_ITERATOR_CACHE = BreakIterator.getWordInstance(ULocale.ROOT);
|
||||
public static List<Integer> getBoundary(String toParse) {
|
||||
List<Integer> retVal = new ArrayList<Integer>();
|
||||
List<Integer> retVal = new ArrayList<>();
|
||||
BreakIterator bi = (BreakIterator) BREAK_ITERATOR_CACHE.clone();
|
||||
bi.setText(toParse);
|
||||
for (int boundary=bi.first(); boundary != BreakIterator.DONE; boundary = bi.next()) {
|
||||
|
@ -579,19 +579,20 @@ public class RBBITest extends TestFmwk {
|
|||
int numCharClasses = dw.fHeader.fCatCount;
|
||||
|
||||
// Check for duplicate columns (character categories)
|
||||
List<String> columns = new ArrayList<String>();
|
||||
List<String> columns = new ArrayList<>();
|
||||
for (int column=0; column<numCharClasses; column++) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int r = 1; r < fwtbl.fNumStates; r++) {
|
||||
int row = dw.getRowIndex(r);
|
||||
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
s.append((char)tableVal);
|
||||
s.append(tableVal);
|
||||
}
|
||||
columns.add(s.toString());
|
||||
}
|
||||
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
|
||||
for (int c1=1; c1<numCharClasses; c1++) {
|
||||
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
|
||||
int limit = c1 < fwtbl.fDictCategoriesStart ? fwtbl.fDictCategoriesStart : numCharClasses;
|
||||
for (int c2 = c1+1; c2 < limit; c2++) {
|
||||
assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
|
||||
// if (columns.get(c1).equals(columns.get(c2))) {
|
||||
// System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
|
||||
|
@ -600,7 +601,7 @@ public class RBBITest extends TestFmwk {
|
|||
}
|
||||
|
||||
// Check for duplicate states.
|
||||
List<String> rows = new ArrayList<String>();
|
||||
List<String> rows = new ArrayList<>();
|
||||
for (int r=0; r<fwtbl.fNumStates; r++) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
int row = dw.getRowIndex(r);
|
||||
|
@ -643,7 +644,7 @@ public class RBBITest extends TestFmwk {
|
|||
public void TestTableRebuild() {
|
||||
// Test to verify that rebuilding the state tables from rule source for the standard
|
||||
// break iterator types yields the same tables as are imported from ICU4C as part of the default data.
|
||||
List<RuleBasedBreakIterator> breakIterators = new ArrayList<RuleBasedBreakIterator>();
|
||||
List<RuleBasedBreakIterator> breakIterators = new ArrayList<>();
|
||||
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getCharacterInstance(Locale.ENGLISH));
|
||||
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getWordInstance(Locale.ENGLISH));
|
||||
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getSentenceInstance(Locale.ENGLISH));
|
||||
|
@ -723,17 +724,17 @@ public class RBBITest extends TestFmwk {
|
|||
|
||||
@Test
|
||||
public void Test8BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(123, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
|
||||
testTrieStateTable(251, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test16BitsTrieWith8BitStateTable() {
|
||||
testTrieStateTable(124, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
|
||||
testTrieStateTable(252, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void Test16BitsTrieWith16BitStateTable() {
|
||||
testTrieStateTable(255, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
|
||||
testTrieStateTable(253, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
Loading…
Add table
Reference in a new issue