mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 23:10:40 +00:00
ICU-12507 ICU4J RBBI, switch to UTrie2
X-SVN-Rev: 40112
This commit is contained in:
parent
a3a2b57516
commit
9d12b335cc
5 changed files with 50 additions and 99 deletions
|
@ -13,10 +13,9 @@ import java.io.IOException;
|
|||
import java.nio.ByteBuffer;
|
||||
import java.nio.ByteOrder;
|
||||
|
||||
import com.ibm.icu.impl.CharTrie;
|
||||
import com.ibm.icu.impl.ICUBinary;
|
||||
import com.ibm.icu.impl.ICUBinary.Authenticate;
|
||||
import com.ibm.icu.impl.Trie;
|
||||
import com.ibm.icu.impl.Trie2;
|
||||
|
||||
/**
|
||||
* <p>Internal class used for Rule Based Break Iterators</p>
|
||||
|
@ -33,7 +32,7 @@ final class RBBIDataWrapper {
|
|||
short fRTable[];
|
||||
short fSFTable[];
|
||||
short fSRTable[];
|
||||
CharTrie fTrie;
|
||||
Trie2 fTrie;
|
||||
String fRuleSource;
|
||||
int fStatusTable[];
|
||||
|
||||
|
@ -147,19 +146,6 @@ final class RBBIDataWrapper {
|
|||
return ROW_DATA + state * (fHeader.fCatCount + 4);
|
||||
}
|
||||
|
||||
static class TrieFoldingFunc implements Trie.DataManipulate {
|
||||
@Override
|
||||
public int getFoldingOffset(int data) {
|
||||
if ((data & 0x8000) != 0) {
|
||||
return data & 0x7fff;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
|
||||
|
||||
|
||||
RBBIDataWrapper() {
|
||||
}
|
||||
|
||||
|
@ -286,7 +272,7 @@ final class RBBIDataWrapper {
|
|||
// as we don't go more than 100 bytes past the
|
||||
// end of the TRIE.
|
||||
|
||||
This.fTrie = new CharTrie(bytes, fTrieFoldingFunc); // Deserialize the TRIE, leaving buffer
|
||||
This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer
|
||||
// at an unknown position, preceding the
|
||||
// padding between TRIE and following section.
|
||||
|
||||
|
@ -461,7 +447,7 @@ final class RBBIDataWrapper {
|
|||
out.println("\nCharacter Categories");
|
||||
out.println("--------------------");
|
||||
for (char32 = 0; char32<=0x10ffff; char32++) {
|
||||
category = fTrie.getCodePointValue(char32);
|
||||
category = fTrie.get(char32);
|
||||
category &= ~0x4000; // Mask off dictionary bit.
|
||||
if (category < 0 || category > fHeader.fCatCount) {
|
||||
out.println("Error, bad category " + Integer.toHexString(category) +
|
||||
|
|
|
@ -14,7 +14,8 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.IntTrieBuilder;
|
||||
import com.ibm.icu.impl.Trie2Writable;
|
||||
import com.ibm.icu.impl.Trie2_16;
|
||||
|
||||
//
|
||||
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
|
||||
|
@ -49,14 +50,14 @@ class RBBISetBuilder {
|
|||
RangeDescriptor() {
|
||||
fIncludesSets = new ArrayList<RBBINode>();
|
||||
}
|
||||
|
||||
|
||||
RangeDescriptor(RangeDescriptor other) {
|
||||
fStartChar = other.fStartChar;
|
||||
fEndChar = other.fEndChar;
|
||||
fNum = other.fNum;
|
||||
fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
|
||||
}
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDescriptor::split()
|
||||
|
@ -65,20 +66,20 @@ class RBBISetBuilder {
|
|||
void split(int where) {
|
||||
Assert.assrt(where>fStartChar && where<=fEndChar);
|
||||
RangeDescriptor nr = new RangeDescriptor(this);
|
||||
|
||||
|
||||
// RangeDescriptor copy constructor copies all fields.
|
||||
// Only need to update those that are different after the split.
|
||||
nr.fStartChar = where;
|
||||
this.fEndChar = where-1;
|
||||
nr.fNext = this.fNext;
|
||||
this.fNext = nr;
|
||||
|
||||
|
||||
// TODO: fIncludesSets is not updated. Check it out.
|
||||
// Probably because they haven't been populated yet,
|
||||
// Probably because they haven't been populated yet,
|
||||
// but still sloppy.
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// RangeDescriptor::setDictionaryFlag
|
||||
|
@ -95,11 +96,11 @@ class RBBISetBuilder {
|
|||
// TODO: a faster way would be to find the set node for
|
||||
// "dictionary" just once, rather than looking it
|
||||
// up by name every time.
|
||||
//
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
void setDictionaryFlag() {
|
||||
int i;
|
||||
|
||||
|
||||
for (i=0; i<this.fIncludesSets.size(); i++) {
|
||||
RBBINode usetNode = fIncludesSets.get(i);
|
||||
String setName = "";
|
||||
|
@ -119,12 +120,13 @@ class RBBISetBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
|
||||
RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
|
||||
|
||||
IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing
|
||||
Trie2Writable fTrie; // The mapping TRIE that is the end result of processing
|
||||
// the Unicode Sets.
|
||||
Trie2_16 fFrozenTrie;
|
||||
|
||||
// Groups correspond to character categories -
|
||||
// groups of ranges that are in the same original UnicodeSets.
|
||||
|
@ -135,8 +137,8 @@ class RBBISetBuilder {
|
|||
int fGroupCount;
|
||||
|
||||
boolean fSawBOF;
|
||||
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// RBBISetBuilder Constructor
|
||||
|
@ -162,7 +164,7 @@ class RBBISetBuilder {
|
|||
// Initialize the process by creating a single range encompassing all characters
|
||||
// that is in no sets.
|
||||
//
|
||||
fRangeList = new RangeDescriptor();
|
||||
fRangeList = new RangeDescriptor();
|
||||
fRangeList.fStartChar = 0;
|
||||
fRangeList.fEndChar = 0x10ffff;
|
||||
|
||||
|
@ -245,7 +247,7 @@ class RBBISetBuilder {
|
|||
}
|
||||
if (rlRange.fNum == 0) {
|
||||
fGroupCount ++;
|
||||
rlRange.fNum = fGroupCount+2;
|
||||
rlRange.fNum = fGroupCount+2;
|
||||
rlRange.setDictionaryFlag();
|
||||
addValToSets(rlRange.fIncludesSets, fGroupCount+2);
|
||||
}
|
||||
|
@ -260,7 +262,7 @@ class RBBISetBuilder {
|
|||
// subtree for each UnicodeSet that contains the string {eof}
|
||||
// Because {bof} and {eof} are not characters in the normal sense,
|
||||
// they don't affect the computation of ranges or the TRIE.
|
||||
|
||||
|
||||
String eofString = "eof";
|
||||
String bofString = "bof";
|
||||
|
||||
|
@ -279,67 +281,26 @@ class RBBISetBuilder {
|
|||
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
|
||||
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
|
||||
|
||||
fTrie = new Trie2Writable(0, // Initial value for all code points
|
||||
0); // Error value.
|
||||
|
||||
//IntTrieBuilder(int aliasdata[], int maxdatalength,
|
||||
// int initialvalue, int leadunitvalue,
|
||||
// boolean latin1linear)
|
||||
|
||||
fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
|
||||
100000, // Max Data Length
|
||||
0, // Initial value for all code points
|
||||
0, // Lead Surrogate unit value,
|
||||
true); // Keep Latin 1 in separately.
|
||||
|
||||
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
|
||||
fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
|
||||
fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar, rlRange.fNum, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// RBBIDataManipulate A little internal class needed only to wrap of the
|
||||
// getFoldedValue() function needed for Trie table creation.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
|
||||
public int getFoldedValue(int start, int offset) {
|
||||
int value;
|
||||
int limit;
|
||||
boolean [] inBlockZero = new boolean[1];
|
||||
|
||||
limit = start + 0x400;
|
||||
while(start<limit) {
|
||||
value = fTrie.getValue(start, inBlockZero);
|
||||
if (inBlockZero[0]) {
|
||||
start += IntTrieBuilder.DATA_BLOCK_LENGTH;
|
||||
} else if (value != 0) {
|
||||
return offset | 0x08000;
|
||||
} else {
|
||||
++start;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
RBBIDataManipulate dm = new RBBIDataManipulate();
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int getTrieSize() {
|
||||
int size = 0;
|
||||
try {
|
||||
// The trie serialize function returns the size of the data written.
|
||||
// null output stream says give size only, don't actually write anything.
|
||||
size = fTrie.serialize(null, true, dm );
|
||||
} catch (IOException e) {
|
||||
Assert.assrt (false);
|
||||
if (fFrozenTrie == null) {
|
||||
fFrozenTrie = fTrie.toTrie2_16();
|
||||
fTrie = null;
|
||||
}
|
||||
return size;
|
||||
return fFrozenTrie.getSerializedLength();
|
||||
}
|
||||
|
||||
|
||||
|
@ -349,7 +310,11 @@ class RBBISetBuilder {
|
|||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
void serializeTrie(OutputStream os) throws IOException {
|
||||
fTrie.serialize(os, true, dm );
|
||||
if (fFrozenTrie == null) {
|
||||
fFrozenTrie = fTrie.toTrie2_16();
|
||||
fTrie = null;
|
||||
}
|
||||
fFrozenTrie.serialize(os);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
|
@ -416,7 +381,7 @@ class RBBISetBuilder {
|
|||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getFirstChar Given a runtime RBBI character category, find
|
||||
// the first UChar32 that is in the set of chars
|
||||
// the first UChar32 that is in the set of chars
|
||||
// in the category.
|
||||
//------------------------------------------------------------------------
|
||||
int getFirstChar(int category) {
|
||||
|
|
|
@ -24,10 +24,10 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.CharTrie;
|
||||
import com.ibm.icu.impl.CharacterIteration;
|
||||
import com.ibm.icu.impl.ICUBinary;
|
||||
import com.ibm.icu.impl.ICUDebug;
|
||||
import com.ibm.icu.impl.Trie2;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
|
@ -495,7 +495,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
|
||||
int foundBreakCount = 0;
|
||||
int c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
|
||||
// Is the character we're starting on a dictionary character? If so, we
|
||||
// need to back up to include the entire run; otherwise the results of
|
||||
|
@ -507,7 +507,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
do {
|
||||
CharacterIteration.next32(fText);
|
||||
c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
} while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
|
||||
|
||||
// Back up to the last dictionary character
|
||||
|
@ -524,7 +524,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
else {
|
||||
do {
|
||||
c = CharacterIteration.previous32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
|
||||
// Back up to the last dictionary character
|
||||
|
@ -538,7 +538,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
}
|
||||
rangeStart = fText.getIndex();
|
||||
}
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
|
||||
|
||||
|
@ -550,14 +550,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
if (reverse) {
|
||||
fText.setIndex(rangeStart);
|
||||
c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
LanguageBreakEngine lbe = null;
|
||||
while(true) {
|
||||
while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
|
||||
CharacterIteration.next32(fText);
|
||||
c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
if (current >= rangeEnd) {
|
||||
break;
|
||||
|
@ -577,7 +577,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
|
||||
// Reload the loop variables for the next go-round
|
||||
c = CharacterIteration.current32(fText);
|
||||
category = (short)fRData.fTrie.getCodePointValue(c);
|
||||
category = (short)fRData.fTrie.get(c);
|
||||
}
|
||||
|
||||
// If we found breaks, build a new break cache. The first and last entries must
|
||||
|
@ -1285,7 +1285,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
|
||||
// caches for quicker access
|
||||
CharacterIterator text = fText;
|
||||
CharTrie trie = fRData.fTrie;
|
||||
Trie2 trie = fRData.fTrie;
|
||||
|
||||
// Set up the starting char
|
||||
int c = text.current();
|
||||
|
@ -1338,7 +1338,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
category = (short) trie.getCodePointValue(c);
|
||||
category = (short) trie.get(c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
|
@ -1504,7 +1504,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
// look up the current character's category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
category = (short) fRData.fTrie.getCodePointValue(c);
|
||||
category = (short) fRData.fTrie.get(c);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:37a4591c3864eb4156671247d3ee06169dd489f65854000badacf0ef7b334a9d
|
||||
size 12127235
|
||||
oid sha256:d315546f344483688e78322304130697164e0d0363b20ed00880598630632341
|
||||
size 12128031
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3c8e833db5b89ace58dde83087599684772a68742408e8a99c9a43b1c9d00c7f
|
||||
size 92449
|
||||
oid sha256:17fb194e1234c73ab09442acf76f1b872d77d8aa7494a06f5964f1342616d69e
|
||||
size 92448
|
||||
|
|
Loading…
Add table
Reference in a new issue