ICU-13565 Break Iteration, remove the dictionary bit from the implementation.

For identifying text that needs to be handled by a word dictionary for Break Iteration,
change from using a bit in the character category to sorting all dictionary categories
together, and recording the boundary between the non-dictionary and dictionary ranges.

This is internal to the implementation. It does not affect behavior.
It does increase the number of character categories that can be handled using a
compact 8 bit Trie, from 127 to 255.
This commit is contained in:
Andy Heninger 2020-06-09 13:19:17 -07:00
parent 85aee40cc3
commit 1eef362329
17 changed files with 326 additions and 307 deletions

View file

@ -763,15 +763,15 @@ int32_t RuleBasedBreakIterator::handleNext() {
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
if (statetable->fFlags & RBBI_8BITS_ROWS) {
if (use8BitsTrie) {
return handleNext<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>();
return handleNext<RBBIStateTableRow8, TrieFunc8>();
} else {
return handleNext<RBBIStateTableRow8, TrieFunc16, kDictBit>();
return handleNext<RBBIStateTableRow8, TrieFunc16>();
}
} else {
if (use8BitsTrie) {
return handleNext<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>();
return handleNext<RBBIStateTableRow16, TrieFunc8>();
} else {
return handleNext<RBBIStateTableRow16, TrieFunc16, kDictBit>();
return handleNext<RBBIStateTableRow16, TrieFunc16>();
}
}
}
@ -781,15 +781,15 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8;
if (statetable->fFlags & RBBI_8BITS_ROWS) {
if (use8BitsTrie) {
return handleSafePrevious<RBBIStateTableRow8, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
return handleSafePrevious<RBBIStateTableRow8, TrieFunc8>(fromPosition);
} else {
return handleSafePrevious<RBBIStateTableRow8, TrieFunc16, kDictBit>(fromPosition);
return handleSafePrevious<RBBIStateTableRow8, TrieFunc16>(fromPosition);
}
} else {
if (use8BitsTrie) {
return handleSafePrevious<RBBIStateTableRow16, TrieFunc8, kDictBitFor8BitsTrie>(fromPosition);
return handleSafePrevious<RBBIStateTableRow16, TrieFunc8>(fromPosition);
} else {
return handleSafePrevious<RBBIStateTableRow16, TrieFunc16, kDictBit>(fromPosition);
return handleSafePrevious<RBBIStateTableRow16, TrieFunc16>(fromPosition);
}
}
}
@ -801,7 +801,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
// Run the state machine to find a boundary
//
//-----------------------------------------------------------------------------------
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
int32_t RuleBasedBreakIterator::handleNext() {
int32_t state;
uint16_t category = 0;
@ -815,6 +815,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
const RBBIStateTable *statetable = fData->fForwardTable;
const char *tableData = statetable->fTableData;
uint32_t tableRowLen = statetable->fRowLen;
uint32_t dictStart = statetable->fDictCategoriesStart;
#ifdef RBBI_DEBUG
if (gTrace) {
RBBIDebugPuts("Handle Next pos char state category");
@ -876,17 +877,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
// look up the current character's character category, which tells us
// which column in the state table to look at.
category = trieFunc(fData->fTrie, c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iteration.
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & dictMask) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~dictMask;
}
fDictionaryCharCount += (category >= dictStart);
}
#ifdef RBBI_DEBUG
@ -993,7 +984,7 @@ int32_t RuleBasedBreakIterator::handleNext() {
// because the safe table does not require as many options.
//
//-----------------------------------------------------------------------------------
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc, uint16_t dictMask>
template <typename RowType, RuleBasedBreakIterator::PTrieFunc trieFunc>
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
int32_t state;
@ -1030,7 +1021,6 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
//
// Off the dictionary flag bit. For reverse iteration it is not used.
category = trieFunc(fData->fTrie, c);
category &= ~dictMask;
#ifdef RBBI_DEBUG
if (gTrace) {

View file

@ -119,8 +119,6 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_
void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos,
int32_t firstRuleStatus, int32_t otherRuleStatus) {
uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ?
kDictBitFor8BitsTrie : kDictBit;
if ((endPos - startPos) <= 1) {
return;
}
@ -145,9 +143,11 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
utext_setNativeIndex(text, rangeStart);
UChar32 c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);
uint32_t dictStart = fBI->fData->fForwardTable->fDictCategoriesStart;
while(U_SUCCESS(status)) {
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) {
while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd
&& (category < dictStart)) {
utext_next32(text); // TODO: cleaner loop structure.
c = utext_current32(text);
category = ucptrie_get(fBI->fData->fTrie, c);

View file

@ -101,18 +101,18 @@ struct RBBIStateTableRowT {
// Value 0: not an accepting state.
// 1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state.
// >1: Look-ahead match has completed.
// Actual boundary position happened earlier
// Actual boundary position happened earlier.
// Value here == fLookAhead in earlier
// state, at actual boundary pos.
// state, at actual boundary pos.
T fLookAhead; // Non-zero if this row is for a state that
// corresponds to a '/' in the rule source.
// Value is the same as the fAccepting
// value for the rule (which will appear
// in a different state.
// value for the rule (which will appear
// in a different state).
T fTagsIdx; // Non-zero if this row covers a {tagged} position
// from a rule. Value is the index in the
// StatusTable of the set of matching
// tags (rule status values)
// from a rule. Value is the index in the
// StatusTable of the set of matching
// tags (rule status values)
T fNextState[1]; // Next State, indexed by char category.
// Variable-length array declared with length 1
// to disable bounds checkers.
@ -132,14 +132,17 @@ union RBBIStateTableRow {
};
struct RBBIStateTable {
uint32_t fNumStates; /* Number of states. */
uint32_t fRowLen; /* Length of a state table row, in bytes. */
uint32_t fFlags; /* Option Flags for this state table */
char fTableData[1]; /* First RBBIStateTableRow begins here. */
/* Variable-length array declared with length 1 */
/* to disable bounds checkers. */
/* (making it char[] simplifies ugly address */
/* arithmetic for indexing variable length rows.) */
uint32_t fNumStates; // Number of states.
uint32_t fRowLen; // Length of a state table row, in bytes.
uint32_t fDictCategoriesStart; // Char category number of the first dictionary
// char class, or the largest category number + 1
// if there are no dictionary categories.
uint32_t fFlags; // Option Flags for this state table.
char fTableData[1]; // First RBBIStateTableRow begins here.
// Variable-length array declared with length 1
// to disable bounds checkers.
// (making it char[] simplifies ugly address
// arithmetic for indexing variable length rows.)
};
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;

View file

@ -287,9 +287,7 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
// Munge the Unicode Sets to create an initial set of character categories.
//
fSetBuilder->buildRanges();
@ -303,6 +301,12 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
}
fForwardTable->buildForwardTable();
// State table and character category optimization.
// Merge equivalent rows and columns.
// Note that this process alters the initial set of character categories,
// causing the representation of UnicodeSets in the parse tree to become invalid.
optimizeTables();
fForwardTable->buildSafeReverseTable(status);
@ -315,6 +319,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
}
#endif
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
fSetBuilder->buildTrie();
//

View file

@ -19,7 +19,7 @@
// by the RBBI rules.
// - compute a set of non-overlapping character ranges
// with all characters within a range belonging to the same
// set of input uniocde sets.
// set of input unicode sets.
// - Derive a set of non-overlapping UnicodeSet (like things)
// that will correspond to columns in the state table for
// the RBBI execution engine. All characters within one
@ -45,7 +45,7 @@
U_NAMESPACE_BEGIN
const int32_t kMaxCharCategoriesFor8BitsTrie = 127;
const int32_t kMaxCharCategoriesFor8BitsTrie = 255;
//------------------------------------------------------------------------
//
// Constructor
@ -55,12 +55,12 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
{
fRB = rb;
fStatus = rb->fStatus;
fRangeList = 0;
fRangeList = nullptr;
fMutableTrie = nullptr;
fTrie = nullptr;
fTrieSize = 0;
fGroupCount = 0;
fSawBOF = FALSE;
fSawBOF = false;
}
@ -196,25 +196,48 @@ void RBBISetBuilder::buildRanges() {
//
// Numbering: # 0 (state table column 0) is unused.
// # 1 is reserved - table column 1 is for end-of-input
// # 2 is reserved - table column 2 is for beginning-in-input
// # 2 is reserved - table column 2 is for beginning-of-input
// # 3 is the first range list.
//
RangeDescriptor *rlSearchRange;
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
int32_t dictGroupCount = 0;
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
rlRange->fNum = rlSearchRange->fNum;
rlRange->fIncludesDict = rlSearchRange->fIncludesDict;
break;
}
}
if (rlRange->fNum == 0) {
fGroupCount ++;
rlRange->fNum = fGroupCount+2;
rlRange->setDictionaryFlag();
addValToSets(rlRange->fIncludesSets, fGroupCount+2);
rlRange->fFirstInGroup = true;
if (rlRange->isDictionaryRange()) {
rlRange->fNum = ++dictGroupCount;
rlRange->fIncludesDict = true;
} else {
fGroupCount++;
rlRange->fNum = fGroupCount+2;
addValToSets(rlRange->fIncludesSets, rlRange->fNum);
}
}
}
// Move the character category numbers for any dictionary ranges up, so that they
// immediately follow the non-dictionary ranges.
fDictCategoriesStart = fGroupCount + 3;
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
if (rlRange->fIncludesDict) {
rlRange->fNum += fDictCategoriesStart - 1;
if (rlRange->fFirstInGroup) {
addValToSets(rlRange->fIncludesSets, rlRange->fNum);
}
}
}
fGroupCount += dictGroupCount;
// Handle input sets that contain the special string {eof}.
// Column 1 of the state table is reserved for EOF on input.
// Column 2 is reserved for before-the-start-input.
@ -222,13 +245,11 @@ void RBBISetBuilder::buildRanges() {
// references to {bof}.)
// Add this column value (1 or 2) to the equivalent expression
// subtree for each UnicodeSet that contains the string {eof}
// Because {bof} and {eof} are not a characters in the normal sense,
// they doesn't affect the computation of ranges or TRIE.
static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
// Because {bof} and {eof} are not characters in the normal sense,
// they don't affect the computation of the ranges or TRIE.
UnicodeString eofString(eofUString);
UnicodeString bofString(bofUString);
UnicodeString eofString(u"eof");
UnicodeString bofString(u"bof");
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
if (usetNode==NULL) {
@ -255,24 +276,16 @@ void RBBISetBuilder::buildRanges() {
// range group number.
//
void RBBISetBuilder::buildTrie() {
RangeDescriptor *rlRange;
fMutableTrie = umutablecptrie_open(
0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie;
for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) {
uint32_t value = rlRange->fNum;
if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) {
U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0);
value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit);
}
for (RangeDescriptor *range = fRangeList; range!=nullptr && U_SUCCESS(*fStatus); range=range->fNext) {
umutablecptrie_setRange(fMutableTrie,
rlRange->fStartChar, // Range start
rlRange->fEndChar, // Range end (inclusive)
value, // value for range
range->fStartChar, // Range start
range->fEndChar, // Range end (inclusive)
range->fNum, // value for range
fStatus);
}
}
@ -281,16 +294,21 @@ void RBBISetBuilder::buildTrie() {
void RBBISetBuilder::mergeCategories(IntPair categories) {
U_ASSERT(categories.first >= 1);
U_ASSERT(categories.second > categories.first);
U_ASSERT((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
(categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit;
int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit;
int32_t rangeNum = rd->fNum;
if (rangeNum == categories.second) {
rd->fNum = categories.first | rangeDict;
rd->fNum = categories.first;
} else if (rangeNum > categories.second) {
rd->fNum--;
}
}
--fGroupCount;
if (categories.second <= fDictCategoriesStart) {
--fDictCategoriesStart;
}
}
@ -395,6 +413,16 @@ int32_t RBBISetBuilder::getNumCharCategories() const {
}
//------------------------------------------------------------------------
//
// getDictCategoriesStart
//
//------------------------------------------------------------------------
int32_t RBBISetBuilder::getDictCategoriesStart() const {
// Returns the number of the first dictionary character category.
// When there are no dictionary categories, the stored value is the
// last category number + 1.
return fDictCategoriesStart;
}
//------------------------------------------------------------------------
//
// sawBOF
@ -414,7 +442,7 @@ UBool RBBISetBuilder::sawBOF() const {
UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
RangeDescriptor *rlRange;
UChar32 retVal = (UChar32)-1;
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
if (rlRange->fNum == category) {
retVal = rlRange->fStartChar;
break;
@ -424,7 +452,6 @@ UChar32 RBBISetBuilder::getFirstChar(int32_t category) const {
}
//------------------------------------------------------------------------
//
// printRanges A debugging function.
@ -437,16 +464,16 @@ void RBBISetBuilder::printRanges() {
int i;
RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
RBBIDebugPrintf("%2i %4x-%4x ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
for (rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
RBBIDebugPrintf("%4x-%4x ", rlRange->fStartChar, rlRange->fEndChar);
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
UnicodeString setName = UNICODE_STRING("anon", 4);
UnicodeString setName {u"anon"};
RBBINode *setRef = usetNode->fParent;
if (setRef != NULL) {
if (setRef != nullptr) {
RBBINode *varRef = setRef->fParent;
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
if (varRef != nullptr && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
}
}
@ -466,19 +493,15 @@ void RBBISetBuilder::printRanges() {
//------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBISetBuilder::printRangeGroups() {
RangeDescriptor *rlRange;
RangeDescriptor *tRange;
int i;
int lastPrintedGroupNum = 0;
RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
int groupNum = rlRange->fNum & 0xbfff;
if (groupNum > lastPrintedGroupNum) {
lastPrintedGroupNum = groupNum;
for (RangeDescriptor *rlRange = fRangeList; rlRange!=nullptr; rlRange=rlRange->fNext) {
if (rlRange->fFirstInGroup) {
int groupNum = rlRange->fNum;
RBBIDebugPrintf("%2i ", groupNum);
if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" <DICT> ");}
if (groupNum >= fDictCategoriesStart) { RBBIDebugPrintf(" <DICT> ");}
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
@ -494,7 +517,7 @@ void RBBISetBuilder::printRangeGroups() {
}
i = 0;
for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
for (RangeDescriptor *tRange = rlRange; tRange != nullptr; tRange = tRange->fNext) {
if (tRange->fNum == rlRange->fNum) {
if (i++ % 5 == 0) {
RBBIDebugPrintf("\n ");
@ -561,28 +584,22 @@ void RBBISetBuilder::printSets() {
//
//-------------------------------------------------------------------------------------
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
int i;
RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) :
fStartChar(other.fStartChar), fEndChar {other.fEndChar}, fNum {other.fNum},
fIncludesDict{other.fIncludesDict}, fFirstInGroup{other.fFirstInGroup} {
this->fStartChar = other.fStartChar;
this->fEndChar = other.fEndChar;
this->fNum = other.fNum;
this->fNext = NULL;
UErrorCode oldstatus = status;
this->fIncludesSets = new UVector(status);
if (U_FAILURE(oldstatus)) {
status = oldstatus;
if (U_FAILURE(status)) {
return;
}
fIncludesSets = new UVector(status);
if (this->fIncludesSets == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status)) {
return;
}
/* test for NULL */
if (this->fIncludesSets == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
for (i=0; i<other.fIncludesSets->size(); i++) {
for (int32_t i=0; i<other.fIncludesSets->size(); i++) {
this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
}
}
@ -594,24 +611,13 @@ RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &statu
//
//-------------------------------------------------------------------------------------
RangeDescriptor::RangeDescriptor(UErrorCode &status) {
this->fStartChar = 0;
this->fEndChar = 0;
this->fNum = 0;
this->fNext = NULL;
UErrorCode oldstatus = status;
this->fIncludesSets = new UVector(status);
if (U_FAILURE(oldstatus)) {
status = oldstatus;
}
if (U_FAILURE(status)) {
return;
}
/* test for NULL */
if(this->fIncludesSets == 0) {
fIncludesSets = new UVector(status);
if (fIncludesSets == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
@ -622,7 +628,7 @@ RangeDescriptor::RangeDescriptor(UErrorCode &status) {
//-------------------------------------------------------------------------------------
RangeDescriptor::~RangeDescriptor() {
delete fIncludesSets;
fIncludesSets = NULL;
fIncludesSets = nullptr;
}
//-------------------------------------------------------------------------------------
@ -633,7 +639,7 @@ RangeDescriptor::~RangeDescriptor() {
void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
U_ASSERT(where>fStartChar && where<=fEndChar);
RangeDescriptor *nr = new RangeDescriptor(*this, status);
if(nr == 0) {
if(nr == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
@ -652,27 +658,22 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::setDictionaryFlag
// RangeDescriptor::isDictionaryRange
//
// Character Category Numbers that include characters from
// the original Unicode Set named "dictionary" have bit 14
// set to 1. The RBBI runtime engine uses this to trigger
// use of the word dictionary.
// Test whether this range includes characters from
// the original Unicode Set named "dictionary".
//
// This function looks through the Unicode Sets that it
// (the range) includes, and sets the bit in fNum when
// "dictionary" is among them.
// This function looks through the Unicode Sets that
// the range includes, checking for one named "dictionary"
//
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
//-------------------------------------------------------------------------------------
void RangeDescriptor::setDictionaryFlag() {
int i;
bool RangeDescriptor::isDictionaryRange() {
static const char16_t *dictionary = u"dictionary";
for (i=0; i<fIncludesSets->size(); i++) {
for (int32_t i=0; i<fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
RBBINode *setRef = usetNode->fParent;
if (setRef != nullptr) {
@ -680,16 +681,14 @@ void RangeDescriptor::setDictionaryFlag() {
if (varRef && varRef->fType == RBBINode::varRef) {
const UnicodeString *setName = &varRef->fText;
if (setName->compare(dictionary, -1) == 0) {
fNum |= RuleBasedBreakIterator::kDictBit;
break;
return true;
}
}
}
}
return false;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -41,25 +41,26 @@ U_NAMESPACE_BEGIN
//
class RangeDescriptor : public UMemory {
public:
UChar32 fStartChar; // Start of range, unicode 32 bit value.
UChar32 fEndChar; // End of range, unicode 32 bit value.
int32_t fNum; // runtime-mapped input value for this range.
UVector *fIncludesSets; // vector of the the original
// Unicode sets that include this range.
// (Contains ptrs to uset nodes)
RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
UChar32 fStartChar {}; // Start of range, unicode 32 bit value.
UChar32 fEndChar {}; // End of range, unicode 32 bit value.
int32_t fNum {0}; // runtime-mapped input value for this range.
bool fIncludesDict {false}; // True if the range includes $dictionary.
bool fFirstInGroup {false}; // True if first range in a group with the same fNum.
UVector *fIncludesSets {nullptr}; // vector of the the original
// Unicode sets that include this range.
// (Contains ptrs to uset nodes)
RangeDescriptor *fNext {nullptr}; // Next RangeDescriptor in the linked list.
RangeDescriptor(UErrorCode &status);
RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
~RangeDescriptor();
void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
// where appearing in the second (higher) part.
void setDictionaryFlag(); // Check whether this range appears as part of
bool isDictionaryRange(); // Check whether this range appears as part of
// the Unicode set named "dictionary"
private:
RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
RangeDescriptor(const RangeDescriptor &other) = delete; // forbid default copying of this class
RangeDescriptor &operator=(const RangeDescriptor &other) = delete; // forbid assigning of this class
};
@ -90,6 +91,8 @@ public:
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
// runtime state machine, which are the same as
// columns in the DFA state table
int32_t getDictCategoriesStart() const; // First char category that includes $dictionary, or
// last category + 1 if there are no dictionary categories.
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
void serializeTrie(uint8_t *where); // write out the serialized Trie.
UChar32 getFirstChar(int32_t val) const;
@ -113,8 +116,6 @@ public:
#endif
private:
void numberSets();
RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
UErrorCode *fStatus;
@ -124,14 +125,13 @@ private:
UCPTrie *fTrie; // the Unicode Sets.
uint32_t fTrieSize;
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
// fGroupCount is the index of the last used group.
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
// State table column 0 is not used. Column 1 is for end-of-input.
// column 2 is for group 0. Funny counting.
// Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
int32_t fGroupCount;
// The number of the first dictionary char category.
// If there are no Dictionary categories, set to the last category + 1.
int32_t fDictCategoriesStart;
UBool fSawBOF;
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class

View file

@ -1155,7 +1155,13 @@ bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
for (; categories->first < numCols-1; categories->first++) {
for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
// Note: dictionary & non-dictionary columns cannot be merged.
// The limitSecond value prevents considering mixed pairs.
// Dictionary categories are >= DictCategoriesStart.
// Non dict categories are < DictCategoriesStart.
int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
fRB->fSetBuilder->getDictCategoriesStart() : numCols;
for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
// Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
uint16_t table_base = 0;
uint16_t table_dupl = 1;
@ -1379,6 +1385,7 @@ void RBBITableBuilder::exportTable(void *where) {
}
table->fNumStates = fDStates->size();
table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
table->fFlags = 0;
if (use8BitsForTable()) {
table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
@ -1652,12 +1659,12 @@ void RBBITableBuilder::printStates() {
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
RBBIDebugPrintf(" | Acc LA Tag");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", c);
RBBIDebugPrintf(" %3d", c);
}
RBBIDebugPrintf("\n");
RBBIDebugPrintf(" |---------------");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf("---");
RBBIDebugPrintf("----");
}
RBBIDebugPrintf("\n");
@ -1666,7 +1673,7 @@ void RBBITableBuilder::printStates() {
RBBIDebugPrintf(" %3d | " , n);
RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
}
RBBIDebugPrintf("\n");
}

View file

@ -677,10 +677,10 @@ private:
typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32);
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
template<typename RowType, PTrieFunc trieFunc>
int32_t handleSafePrevious(int32_t fromPosition);
template<typename RowType, PTrieFunc trieFunc, uint16_t dictMask>
template<typename RowType, PTrieFunc trieFunc>
int32_t handleNext();
@ -705,17 +705,6 @@ private:
* @internal
*/
void dumpTables();
/**
* Bit for dictionary based category
*/
static constexpr int32_t kDictBit = 0x4000;
/**
* Bit for dictionary based category in 8bits trie
*/
static constexpr int32_t kDictBitFor8BitsTrie = 0x0080;
#endif /* U_HIDE_INTERNAL_API */
};

View file

@ -4657,7 +4657,8 @@ void RBBITest::TestTableRedundancies() {
}
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
for (int c1=1; c1<numCharClasses; c1++) {
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
int limit = c1 < (int)fwtbl->fDictCategoriesStart ? fwtbl->fDictCategoriesStart : numCharClasses;
for (int c2 = c1+1; c2 < limit; c2++) {
if (columns.at(c1) == columns.at(c2)) {
errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
goto out;
@ -4952,15 +4953,15 @@ void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits
}
void RBBITest::Test8BitsTrieWith8BitStateTable() {
testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
testTrieStateTable(251, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
}
void RBBITest::Test16BitsTrieWith8BitStateTable() {
testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
testTrieStateTable(252, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */);
}
void RBBITest::Test16BitsTrieWith16BitStateTable() {
testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
testTrieStateTable(253, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */);
}
void RBBITest::Test8BitsTrieWith16BitStateTable() {

View file

@ -41,10 +41,20 @@ public final class RBBIDataWrapper {
* Length of a table row in bytes. Note mismatch with table data, which is short[].
*/
public int fRowLen;
/**
* Char category number of the first dictionary char class,
* or the largest category number + 1 if there are no dictionary categories.
*/
public int fDictCategoriesStart;
/**
* Option Flags for this state table.
*/
public int fFlags;
/**
* Length in bytes of the state table header, of all the int32 fields
* preceding fTable in the serialized form.
*/
public static int fHeaderSize = 16;
/**
* Linear array of next state values, accessed as short[state, char_class]
*/
@ -57,14 +67,15 @@ public final class RBBIDataWrapper {
if (length == 0) {
return null;
}
if (length < 12) {
if (length < fHeaderSize) {
throw new IOException("Invalid RBBI state table length.");
}
RBBIStateTable This = new RBBIStateTable();
This.fNumStates = bytes.getInt();
This.fRowLen = bytes.getInt();
This.fDictCategoriesStart = bytes.getInt();
This.fFlags = bytes.getInt();
int lengthOfTable = length - 12; // length in bytes.
int lengthOfTable = length - fHeaderSize; // length in bytes.
boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS;
if (use8Bits) {
This.fTable = new char[lengthOfTable];
@ -82,6 +93,7 @@ public final class RBBIDataWrapper {
public int put(DataOutputStream bytes) throws IOException {
bytes.writeInt(fNumStates);
bytes.writeInt(fRowLen);
bytes.writeInt(fDictCategoriesStart);
bytes.writeInt(fFlags);
if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) {
int tableLen = fRowLen * fNumStates; // fRowLen is bytes.
@ -95,8 +107,8 @@ public final class RBBIDataWrapper {
bytes.writeChar(fTable[i]);
}
}
int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written,
// including 12 for the header.
int bytesWritten = fHeaderSize + fRowLen * fNumStates; // total bytes written,
// including the header.
while (bytesWritten % 8 != 0) {
bytes.writeByte(0);
++bytesWritten;
@ -118,6 +130,7 @@ public final class RBBIDataWrapper {
RBBIStateTable otherST = (RBBIStateTable)other;
if (fNumStates != otherST.fNumStates) return false;
if (fRowLen != otherST.fRowLen) return false;
if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false;
if (fFlags != otherST.fFlags) return false;
return Arrays.equals(fTable, otherST.fTable);
}
@ -216,9 +229,6 @@ public final class RBBIDataWrapper {
public final static int RBBI_BOF_REQUIRED = 2;
public final static int RBBI_8BITS_ROWS = 4;
public final static int DICT_BIT = 0x4000;
public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
/**
* Data Header. A struct-like class with the fields from the RBBI data file header.
* Not intended for public use, declared public for testing purposes only.
@ -496,7 +506,6 @@ public final class RBBIDataWrapper {
int char32;
int category;
int lastNewline[] = new int[n+1];
int dictMask = fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? DICT_BIT_FOR_8BITS_TRIE : DICT_BIT;
for (category = 0; category <= fHeader.fCatCount; category ++) {
catStrings[category] = "";
@ -505,7 +514,6 @@ public final class RBBIDataWrapper {
out.println("--------------------");
for (char32 = 0; char32<=0x10ffff; char32++) {
category = fTrie.get(char32);
category &= ~dictMask; // Mask off dictionary bit.
if (category < 0 || category > fHeader.fCatCount) {
out.println("Error, bad category " + Integer.toHexString(category) +
" for char " + Integer.toHexString(char32));

View file

@ -67,7 +67,7 @@ class RBBIRuleBuilder {
//
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
//
Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
Map<Set<Integer>, Integer> fStatusSets = new HashMap<>(); // Status value sets encountered so far.
// Map Key is the set of values.
// Map Value is the runtime array index.
@ -146,8 +146,8 @@ class RBBIRuleBuilder {
ICUDebug.value("rbbi") : null;
fRules = rules;
fStrippedRules = new StringBuilder(rules);
fUSetNodes = new ArrayList<RBBINode>();
fRuleStatusVals = new ArrayList<Integer>();
fUSetNodes = new ArrayList<>();
fRuleStatusVals = new ArrayList<>();
fScanner = new RBBIRuleScanner(this);
fSetBuilder = new RBBISetBuilder(this);
}
@ -294,9 +294,7 @@ class RBBIRuleBuilder {
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
// Munge the Unicode Sets to create an initial set of character categories.
//
fSetBuilder.buildRanges();
@ -305,6 +303,10 @@ class RBBIRuleBuilder {
//
fForwardTable = new RBBITableBuilder(this, fForwardTree);
fForwardTable.buildForwardTable();
// State table and character category optimization.
// Merge equivalent rows and columns.
// Note that this process alters the initial set of character categories,
// causing the representation of UnicodeSets in the parse tree to become invalid.
optimizeTables();
fForwardTable.buildSafeReverseTable();
@ -315,7 +317,9 @@ class RBBIRuleBuilder {
fForwardTable.printRuleStatusTable();
fForwardTable.printReverseTable();
}
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
fSetBuilder.buildTrie();
//
// Package up the compiled data, writing it to an output stream

View file

@ -29,7 +29,7 @@ import com.ibm.icu.util.MutableCodePointTrie;
// by the RBBI rules.
// - compute a set of non-overlapping character ranges
// with all characters within a range belonging to the same
// set of input uniocde sets.
// set of input unicode sets.
// - Derive a set of non-overlapping UnicodeSet (like things)
// that will correspond to columns in the state table for
// the RBBI execution engine. All characters within one
@ -41,23 +41,27 @@ import com.ibm.icu.util.MutableCodePointTrie;
//
class RBBISetBuilder {
static class RangeDescriptor {
int fStartChar; // Start of range, unicode 32 bit value.
int fEndChar; // End of range, unicode 32 bit value.
int fNum; // runtime-mapped input value for this range.
List<RBBINode> fIncludesSets; // vector of the the original
// Unicode sets that include this range.
// (Contains ptrs to uset nodes)
RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
int fStartChar = 0; // Start of range, unicode 32 bit value.
int fEndChar = 0; // End of range, unicode 32 bit value.
int fNum = 0; // runtime-mapped input value for this range.
boolean fIncludesDict = false; // True if the range includes $dictionary.
boolean fFirstInGroup = false; // True if first range in a group with the same fNum.
List<RBBINode> fIncludesSets; // vector of the original
// Unicode sets that include this range.
// (Contains ptrs to uset nodes)
RangeDescriptor fNext; // Next RangeDescriptor in the linked list.
RangeDescriptor() {
fIncludesSets = new ArrayList<RBBINode>();
fIncludesSets = new ArrayList<>();
}
RangeDescriptor(RangeDescriptor other) {
fStartChar = other.fStartChar;
fEndChar = other.fEndChar;
fNum = other.fNum;
fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
fIncludesDict = other.fIncludesDict;
fFirstInGroup = other.fFirstInGroup;
fIncludesSets = new ArrayList<>(other.fIncludesSets);
}
//-------------------------------------------------------------------------------------
@ -82,28 +86,18 @@ class RBBISetBuilder {
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::setDictionaryFlag
//
// Character Category Numbers that include characters from
// the original Unicode Set named "dictionary" have bit 14
// set to 1. The RBBI runtime engine uses this to trigger
// use of the word dictionary.
//
// This function looks through the Unicode Sets that it
// (the range) includes, and sets the bit in fNum when
// "dictionary" is among them.
//
/**
* Test whether this range includes characters from the original Unicode Set named "dictionary".
*
* This function looks through the Unicode Sets that
* the range includes, checking for one named "dictionary"
*/
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
// -------------------------------------------------------------------------------------
void setDictionaryFlag() {
int i;
for (i=0; i<this.fIncludesSets.size(); i++) {
boolean isDictionaryRange() {
for (int i=0; i<this.fIncludesSets.size(); i++) {
RBBINode usetNode = fIncludesSets.get(i);
String setName = "";
RBBINode setRef = usetNode.fParent;
@ -114,11 +108,10 @@ class RBBISetBuilder {
}
}
if (setName.equals("dictionary")) {
this.fNum |= DICT_BIT;
break;
return true;
}
}
return false;
}
}
@ -130,19 +123,18 @@ class RBBISetBuilder {
// the Unicode Sets.
CodePointTrie fFrozenTrie;
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
// fGroupCount is the index of the last used group.
// fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
// State table column 0 is not used. Column 1 is for end-of-input.
// column 2 is for group 0. Funny counting.
/**
* Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
*/
int fGroupCount;
/**
* The number of the first dictionary char category.
* If there are no Dictionary categories, set to the last category + 1.
*/
int fDictCategoriesStart;
boolean fSawBOF;
static final int DICT_BIT = 0x4000;
static final int DICT_BIT_FOR_8BITS_TRIE = 0x0080;
//------------------------------------------------------------------------
//
@ -239,25 +231,49 @@ class RBBISetBuilder {
//
// Numbering: # 0 (state table column 0) is unused.
// # 1 is reserved - table column 1 is for end-of-input
// # 2 is reserved - table column 2 is for beginning-in-input
// # 2 is reserved - table column 2 is for beginning-of-input
// # 3 is the first range list.
//
RangeDescriptor rlSearchRange;
int dictGroupCount = 0;
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
rlRange.fNum = rlSearchRange.fNum;
rlRange.fIncludesDict = rlSearchRange.fIncludesDict;
break;
}
}
if (rlRange.fNum == 0) {
fGroupCount ++;
rlRange.fNum = fGroupCount+2;
rlRange.setDictionaryFlag();
addValToSets(rlRange.fIncludesSets, fGroupCount+2);
rlRange.fFirstInGroup = true;
if (rlRange.isDictionaryRange()) {
rlRange.fNum = ++dictGroupCount;
rlRange.fIncludesDict = true;
} else {
fGroupCount++;
rlRange.fNum = fGroupCount + 2;
addValToSets(rlRange.fIncludesSets, fGroupCount + 2);
}
}
}
// Move the character category numbers for any dictionary ranges up, so that they
// immediately follow the non-dictionary ranges.
fDictCategoriesStart = fGroupCount + 3;
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
if (rlRange.fIncludesDict) {
rlRange.fNum += fDictCategoriesStart - 1;
if (rlRange.fFirstInGroup) {
addValToSets(rlRange.fIncludesSets, rlRange.fNum);
}
}
}
fGroupCount += dictGroupCount;
// Handle input sets that contain the special string {eof}.
// Column 1 of the state table is reserved for EOF on input.
// Column 2 is reserved for before-the-start-input.
@ -288,31 +304,21 @@ class RBBISetBuilder {
}
private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127;
private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255;
/**
* Build the Trie table for mapping UChar32 values to the corresponding
* range group number.
*/
void buildTrie() {
boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
RangeDescriptor rlRange;
fTrie = new MutableCodePointTrie(0, // Initial value for all code points.
0); // Error value for out-of-range input.
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
int value = rlRange.fNum;
if (use8Bits && ((value & DICT_BIT) != 0)) {
assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0);
// switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE
value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT);
}
fTrie.setRange(
rlRange.fStartChar, // Range start
rlRange.fEndChar, // Range end (inclusive)
value // value for range
);
for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
fTrie.setRange(rlRange.fStartChar, // Range start
rlRange.fEndChar, // Range end (inclusive)
rlRange.fNum // value for range
);
}
}
@ -324,16 +330,20 @@ class RBBISetBuilder {
// Merge character category categories.second into categories.first and renumber
// the remaining categories so that the numbering stays contiguous.
// NOTE(review): this hunk shows both the earlier DICT_BIT-based lines and their
// replacements side by side; confirm which set is live before editing the logic.
void mergeCategories(IntPair categories) {
assert(categories.first >= 1);
assert(categories.second > categories.first);
// Both categories must lie on the same side of fDictCategoriesStart;
// a pair that straddles the boundary is rejected.
assert((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) ||
(categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
// Walk every range: ranges numbered `second` take the number `first`,
// and ranges numbered above `second` shift down by one.
for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) {
int rangeNum = rd.fNum & ~DICT_BIT;
int rangeDict = rd.fNum & DICT_BIT;
int rangeNum = rd.fNum;
if (rangeNum == categories.second) {
rd.fNum = categories.first | rangeDict;
rd.fNum = categories.first;
} else if (rangeNum > categories.second) {
rd.fNum--;
}
}
// One category has been eliminated.
--fGroupCount;
// When the eliminated category is numbered at or below the boundary,
// the boundary itself moves down by one.
if (categories.second <= fDictCategoriesStart) {
--fDictCategoriesStart;
}
}
//-----------------------------------------------------------------------------------
@ -425,6 +435,16 @@ class RBBISetBuilder {
}
//------------------------------------------------------------------------
//
// getDictCategoriesStart   Return fDictCategoriesStart, the number of the
//                          first dictionary character category. Per the
//                          field's declaration, when there are no dictionary
//                          categories this is the last category + 1.
//
//------------------------------------------------------------------------
int getDictCategoriesStart() {
return fDictCategoriesStart;
}
//------------------------------------------------------------------------
//
// sawBOF
@ -454,7 +474,6 @@ class RBBISetBuilder {
}
//------------------------------------------------------------------------
//
// printRanges A debugging function.
@ -468,7 +487,7 @@ class RBBISetBuilder {
System.out.print("\n\n Nonoverlapping Ranges ...\n");
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar);
System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar);
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
RBBINode usetNode = rlRange.fIncludesSets.get(i);
@ -496,20 +515,16 @@ class RBBISetBuilder {
//------------------------------------------------------------------------
///CLOVER:OFF
void printRangeGroups() {
RangeDescriptor rlRange;
RangeDescriptor tRange;
int i;
int lastPrintedGroupNum = 0;
System.out.print("\nRanges grouped by Unicode Set Membership...\n");
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
int groupNum = rlRange.fNum & 0xbfff;
if (groupNum > lastPrintedGroupNum) {
lastPrintedGroupNum = groupNum;
for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
if (rlRange.fFirstInGroup) {
int groupNum = rlRange.fNum;
if (groupNum<10) {System.out.print(" ");}
System.out.print(groupNum + " ");
if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");}
if (groupNum >= fDictCategoriesStart) { System.out.print(" <DICT> ");}
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
RBBINode usetNode = rlRange.fIncludesSets.get(i);
@ -525,7 +540,7 @@ class RBBISetBuilder {
}
i = 0;
for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
for (RangeDescriptor tRange = rlRange; tRange != null; tRange = tRange.fNext) {
if (tRange.fNum == rlRange.fNum) {
if (i++ % 5 == 0) {
System.out.print("\n ");

View file

@ -905,7 +905,13 @@ class RBBITableBuilder {
int table_base = 0;
int table_dupl = 0;
for (; categories.first < numCols-1; ++categories.first) {
for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
// Note: dictionary & non-dictionary columns cannot be merged.
// The limitSecond value prevents considering mixed pairs.
// Dictionary categories are >= DictCategoriesStart.
// Non dict categories are < DictCategoriesStart.
int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ?
fRB.fSetBuilder.getDictCategoriesStart() : numCols;
for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) {
for (int state=0; state<numStates; state++) {
RBBIStateDescriptor sd = fDStates.get(state);
table_base = sd.fDtran[categories.first];
@ -1103,7 +1109,7 @@ class RBBITableBuilder {
if (fRB.fTreeRoots[fRootIx] == null) {
return 0;
}
int size = 12; // The header of 4 ints, with no rows to the table.
int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
int numRows = fDStates.size();
int numCols = fRB.fSetBuilder.getNumCharCategories();
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
@ -1132,17 +1138,18 @@ class RBBITableBuilder {
Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
fDStates.size() < 0x7fff);
table.fNumStates = fDStates.size();
table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
// Size of the table, in shorts.
int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts.
int tableSize;
if (use8Bits) {
tableSize = (getTableSize() - 12); // fTable length in bytes.
tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
table.fTable = new char[tableSize];
table.fRowLen = rowLen; // Row length in bytes.
} else {
tableSize = (getTableSize() - 12) / 2; // fTable length in shorts.
tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize) / 2; // fTable length in shorts.
table.fTable = new char[tableSize];
table.fRowLen = rowLen * 2; // Row length in bytes.
}
@ -1275,7 +1282,7 @@ class RBBITableBuilder {
if (fSafeTable == null) {
return 0;
}
int size = 12; // The header of 4 ints, with no rows to the table.
int size = RBBIDataWrapper.RBBIStateTable.fHeaderSize; // The header, with no rows to the table.
int numRows = fSafeTable.size();
int numCols = fSafeTable.get(0).length;
boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
@ -1303,7 +1310,7 @@ class RBBITableBuilder {
int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
// TODO: tableSize is basically numStates * numCharCategories,
// except for alignment padding. Clean up here, and in main exportTable().
int tableSize = (getSafeTableSize() - 12); // fTable length in bytes.
int tableSize = (getSafeTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize); // fTable length in bytes.
if (use8Bits) {
table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS;
table.fTable = new char[tableSize];
@ -1357,12 +1364,12 @@ class RBBITableBuilder {
System.out.print("state | i n p u t s y m b o l s \n");
System.out.print(" | Acc LA Tag");
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
RBBINode.printInt(c, 3);
RBBINode.printInt(c, 4);
}
System.out.print("\n");
System.out.print(" |---------------");
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
System.out.print("---");
System.out.print("----");
}
System.out.print("\n");
@ -1376,7 +1383,7 @@ class RBBITableBuilder {
RBBINode.printInt(sd.fTagsIdx, 6);
System.out.print(" ");
for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
RBBINode.printInt(sd.fDtran[c], 3);
RBBINode.printInt(sd.fDtran[c], 4);
}
System.out.print("\n");
}

View file

@ -843,9 +843,8 @@ public class RuleBasedBreakIterator extends BreakIterator {
int row = fRData.getRowIndex(state);
short category = 3;
int flagsState = fRData.fFTable.fFlags;
int dictStart = fRData.fFTable.fDictCategoriesStart;
int mode = RBBI_RUN;
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
@ -882,15 +881,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
//
category = (short) trie.get(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & dictMask) != 0) {
// Check for categories that require word dictionary handling.
if (category >= dictStart) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~dictMask;
}
if (TRACE) {
@ -1004,9 +997,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
CharacterIterator text = fText;
CodePointTrie trie = fRData.fTrie;
char[] stateTable = fRData.fRTable.fTable;
int flagsState = fRData.fRTable.fFlags;
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
CISetIndex32(text, fromPosition);
if (TRACE) {
@ -1032,7 +1022,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
//
// And off the dictionary flag bit. For reverse iteration it is not used.
category = (short) trie.get(c);
category &= ~dictMask;
if (TRACE) {
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
@ -1212,8 +1201,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
int category;
int current;
int foundBreakCount = 0;
int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ?
RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT;
// Loop through the text, looking for ranges of dictionary characters.
// For each span, find the appropriate break engine, and ask it to find
@ -1222,9 +1209,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
fText.setIndex(rangeStart);
int c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.get(c);
int dictStart = fRData.fFTable.fDictCategoriesStart;
while(true) {
while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) {
while((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
c = CharacterIteration.next32(fText); // pre-increment
category = (short)fRData.fTrie.get(c);
}

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bdf00a19b05bc52e17c2aea74e87cc1872a824d5a9cced226078c46a194a8799
size 13141762
oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c
size 13142859

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9
oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7
size 94529

View file

@ -408,7 +408,7 @@ public class RBBITest extends TestFmwk {
}
}
List<Thread> threads = new ArrayList<Thread>();
List<Thread> threads = new ArrayList<>();
for (int n = 0; n<4; ++n) {
threads.add(new Thread(new WorkerThread()));
}
@ -513,7 +513,7 @@ public class RBBITest extends TestFmwk {
}
private static final BreakIterator BREAK_ITERATOR_CACHE = BreakIterator.getWordInstance(ULocale.ROOT);
public static List<Integer> getBoundary(String toParse) {
List<Integer> retVal = new ArrayList<Integer>();
List<Integer> retVal = new ArrayList<>();
BreakIterator bi = (BreakIterator) BREAK_ITERATOR_CACHE.clone();
bi.setText(toParse);
for (int boundary=bi.first(); boundary != BreakIterator.DONE; boundary = bi.next()) {
@ -579,19 +579,20 @@ public class RBBITest extends TestFmwk {
int numCharClasses = dw.fHeader.fCatCount;
// Check for duplicate columns (character categories)
List<String> columns = new ArrayList<String>();
List<String> columns = new ArrayList<>();
for (int column=0; column<numCharClasses; column++) {
StringBuilder s = new StringBuilder();
for (int r = 1; r < fwtbl.fNumStates; r++) {
int row = dw.getRowIndex(r);
char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column];
s.append((char)tableVal);
s.append(tableVal);
}
columns.add(s.toString());
}
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
for (int c1=1; c1<numCharClasses; c1++) {
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
int limit = c1 < fwtbl.fDictCategoriesStart ? fwtbl.fDictCategoriesStart : numCharClasses;
for (int c2 = c1+1; c2 < limit; c2++) {
assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
// if (columns.get(c1).equals(columns.get(c2))) {
// System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
@ -600,7 +601,7 @@ public class RBBITest extends TestFmwk {
}
// Check for duplicate states.
List<String> rows = new ArrayList<String>();
List<String> rows = new ArrayList<>();
for (int r=0; r<fwtbl.fNumStates; r++) {
StringBuilder s = new StringBuilder();
int row = dw.getRowIndex(r);
@ -643,7 +644,7 @@ public class RBBITest extends TestFmwk {
public void TestTableRebuild() {
// Test to verify that rebuilding the state tables from rule source for the standard
// break iterator types yields the same tables as are imported from ICU4C as part of the default data.
List<RuleBasedBreakIterator> breakIterators = new ArrayList<RuleBasedBreakIterator>();
List<RuleBasedBreakIterator> breakIterators = new ArrayList<>();
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getCharacterInstance(Locale.ENGLISH));
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getWordInstance(Locale.ENGLISH));
breakIterators.add((RuleBasedBreakIterator)BreakIterator.getSentenceInstance(Locale.ENGLISH));
@ -723,17 +724,17 @@ public class RBBITest extends TestFmwk {
@Test
public void Test8BitsTrieWith8BitStateTable() {
testTrieStateTable(123, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
testTrieStateTable(251, true /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
}
@Test
public void Test16BitsTrieWith8BitStateTable() {
testTrieStateTable(124, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
testTrieStateTable(252, false /* expectUCPTrieValueWidthIn8Bits */, true /* expectStateRowIn8Bits */);
}
@Test
public void Test16BitsTrieWith16BitStateTable() {
testTrieStateTable(255, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
testTrieStateTable(253, false /* expectUCPTrieValueWidthIn8Bits */, false /* expectStateRowIn8Bits */);
}
@Test