ICU-12507 ICU4J RBBI, switch to UTrie2

X-SVN-Rev: 40112
This commit is contained in:
Andy Heninger 2017-05-04 22:30:40 +00:00
parent a3a2b57516
commit 9d12b335cc
5 changed files with 50 additions and 99 deletions

View file

@ -13,10 +13,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.Trie2;
/**
* <p>Internal class used for Rule Based Break Iterators</p>
@ -33,7 +32,7 @@ final class RBBIDataWrapper {
short fRTable[];
short fSFTable[];
short fSRTable[];
CharTrie fTrie;
Trie2 fTrie;
String fRuleSource;
int fStatusTable[];
@ -147,19 +146,6 @@ final class RBBIDataWrapper {
return ROW_DATA + state * (fHeader.fCatCount + 4);
}
static class TrieFoldingFunc implements Trie.DataManipulate {
    /**
     * Extracts the folding offset from a trie data value.
     *
     * @param data the trie data value to examine
     * @return the low 15 bits of {@code data} when bit 0x8000 is set, otherwise 0
     */
    @Override
    public int getFoldingOffset(int data) {
        final boolean hasOffsetFlag = (data & 0x8000) != 0;
        return hasOffsetFlag ? (data & 0x7fff) : 0;
    }
}
static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();
RBBIDataWrapper() {
}
@ -286,7 +272,7 @@ final class RBBIDataWrapper {
// as we don't go more than 100 bytes past the
// end of the TRIE.
This.fTrie = new CharTrie(bytes, fTrieFoldingFunc); // Deserialize the TRIE, leaving buffer
This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer
// at an unknown position, preceding the
// padding between TRIE and following section.
@ -461,7 +447,7 @@ final class RBBIDataWrapper {
out.println("\nCharacter Categories");
out.println("--------------------");
for (char32 = 0; char32<=0x10ffff; char32++) {
category = fTrie.getCodePointValue(char32);
category = fTrie.get(char32);
category &= ~0x4000; // Mask off dictionary bit.
if (category < 0 || category > fHeader.fCatCount) {
out.println("Error, bad category " + Integer.toHexString(category) +

View file

@ -14,7 +14,8 @@ import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.IntTrieBuilder;
import com.ibm.icu.impl.Trie2Writable;
import com.ibm.icu.impl.Trie2_16;
//
// RBBISetBuilder Handles processing of Unicode Sets from RBBI rules
@ -49,14 +50,14 @@ class RBBISetBuilder {
RangeDescriptor() {
fIncludesSets = new ArrayList<RBBINode>();
}
RangeDescriptor(RangeDescriptor other) {
    // Copy constructor. The list of including sets is duplicated into a new
    // ArrayList, so later additions to one descriptor's list do not show up
    // in the other's; the RBBINode elements themselves are shared.
    this.fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
    this.fNum = other.fNum;
    this.fStartChar = other.fStartChar;
    this.fEndChar = other.fEndChar;
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::split()
@ -65,20 +66,20 @@ class RBBISetBuilder {
void split(int where) {
    // The split point must lie after the first character of this range and
    // no later than its last character.
    Assert.assrt(fStartChar<where && fEndChar>=where);

    // Start from a copy of this descriptor; only the fields that differ
    // between the two halves need to be adjusted afterwards.
    RangeDescriptor upperHalf = new RangeDescriptor(this);

    // The new descriptor covers [where, old fEndChar] and takes over our successor.
    upperHalf.fStartChar = where;
    upperHalf.fNext = this.fNext;

    // This descriptor shrinks to [fStartChar, where-1] and links to the new one.
    this.fEndChar = where-1;
    this.fNext = upperHalf;

    // TODO: fIncludesSets is not updated. Check it out.
    // Probably because they haven't been populated yet,
    // but still sloppy.
}
//-------------------------------------------------------------------------------------
//
// RangeDescriptor::setDictionaryFlag
@ -95,11 +96,11 @@ class RBBISetBuilder {
// TODO: a faster way would be to find the set node for
// "dictionary" just once, rather than looking it
// up by name every time.
//
//
// -------------------------------------------------------------------------------------
void setDictionaryFlag() {
int i;
for (i=0; i<this.fIncludesSets.size(); i++) {
RBBINode usetNode = fIncludesSets.get(i);
String setName = "";
@ -119,12 +120,13 @@ class RBBISetBuilder {
}
}
RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us.
RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors
IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing
Trie2Writable fTrie; // The mapping TRIE that is the end result of processing
// the Unicode Sets.
Trie2_16 fFrozenTrie;
// Groups correspond to character categories -
// groups of ranges that are in the same original UnicodeSets.
@ -135,8 +137,8 @@ class RBBISetBuilder {
int fGroupCount;
boolean fSawBOF;
//------------------------------------------------------------------------
//
// RBBISetBuilder Constructor
@ -162,7 +164,7 @@ class RBBISetBuilder {
// Initialize the process by creating a single range encompassing all characters
// that is in no sets.
//
fRangeList = new RangeDescriptor();
fRangeList = new RangeDescriptor();
fRangeList.fStartChar = 0;
fRangeList.fEndChar = 0x10ffff;
@ -245,7 +247,7 @@ class RBBISetBuilder {
}
if (rlRange.fNum == 0) {
fGroupCount ++;
rlRange.fNum = fGroupCount+2;
rlRange.fNum = fGroupCount+2;
rlRange.setDictionaryFlag();
addValToSets(rlRange.fIncludesSets, fGroupCount+2);
}
@ -260,7 +262,7 @@ class RBBISetBuilder {
// subtree for each UnicodeSet that contains the string {eof}
// Because {bof} and {eof} are not characters in the normal sense,
// they don't affect the computation of ranges or TRIE.
String eofString = "eof";
String bofString = "bof";
@ -279,67 +281,26 @@ class RBBISetBuilder {
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
fTrie = new Trie2Writable(0, // Initial value for all code points
0); // Error value.
//IntTrieBuilder(int aliasdata[], int maxdatalength,
// int initialvalue, int leadunitvalue,
// boolean latin1linear)
fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one)
100000, // Max Data Length
0, // Initial value for all code points
0, // Lead Surrogate unit value,
true); // Keep Latin 1 in separately.
for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar, rlRange.fNum, true);
}
}
//-----------------------------------------------------------------------------------
//
// RBBIDataManipulate A little internal class needed only to wrap the
// getFoldedValue() function needed for Trie table creation.
//
//-----------------------------------------------------------------------------------
class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
    // Scans the 0x400 positions beginning at 'start' in fTrie. Returns
    // (offset | 0x08000) as soon as a non-zero value is found, or 0 when
    // every examined value is zero.
    public int getFoldedValue(int start, int offset) {
        final int limit = start + 0x400;
        final boolean[] inBlockZero = new boolean[1];
        int pos = start;
        while (pos < limit) {
            final int value = fTrie.getValue(pos, inBlockZero);
            if (inBlockZero[0]) {
                // getValue flagged this position via inBlockZero: jump ahead
                // by a whole data block instead of stepping one at a time.
                pos += IntTrieBuilder.DATA_BLOCK_LENGTH;
                continue;
            }
            if (value != 0) {
                return offset | 0x08000;
            }
            ++pos;
        }
        return 0;
    }
}
RBBIDataManipulate dm = new RBBIDataManipulate();
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
//
//-----------------------------------------------------------------------------------
int getTrieSize() {
int size = 0;
try {
// The trie serialize function returns the size of the data written.
// null output stream says give size only, don't actually write anything.
size = fTrie.serialize(null, true, dm );
} catch (IOException e) {
Assert.assrt (false);
if (fFrozenTrie == null) {
fFrozenTrie = fTrie.toTrie2_16();
fTrie = null;
}
return size;
return fFrozenTrie.getSerializedLength();
}
@ -349,7 +310,11 @@ class RBBISetBuilder {
//
//-----------------------------------------------------------------------------------
void serializeTrie(OutputStream os) throws IOException {
// NOTE(review): this span comes from a diff view that shows old and new lines together.
// The call below uses 'dm' (the IntTrieBuilder.DataManipulate helper) and appears to be
// the pre-change statement that the following lines replace -- confirm against the real file.
fTrie.serialize(os, true, dm );
// Build the frozen Trie2_16 from the writable trie the first time it is needed,
// then drop the reference to the writable trie.
if (fFrozenTrie == null) {
fFrozenTrie = fTrie.toTrie2_16();
fTrie = null;
}
// Write the frozen trie to the caller-supplied stream.
fFrozenTrie.serialize(os);
}
//------------------------------------------------------------------------
@ -416,7 +381,7 @@ class RBBISetBuilder {
//------------------------------------------------------------------------
//
// getFirstChar Given a runtime RBBI character category, find
// the first UChar32 that is in the set of chars
// the first UChar32 that is in the set of chars
// in the category.
//------------------------------------------------------------------------
int getFirstChar(int category) {

View file

@ -24,10 +24,10 @@ import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.Trie2;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
@ -495,7 +495,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
DictionaryBreakEngine.DequeI breaks = new DictionaryBreakEngine.DequeI();
int foundBreakCount = 0;
int c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
// Is the character we're starting on a dictionary character? If so, we
// need to back up to include the entire run; otherwise the results of
@ -507,7 +507,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
do {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
} while (c != CharacterIteration.DONE32 && ((category & 0x4000)) != 0);
// Back up to the last dictionary character
@ -524,7 +524,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
else {
do {
c = CharacterIteration.previous32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
while (c != CharacterIteration.DONE32 && ((category & 0x4000) != 0));
// Back up to the last dictionary character
@ -538,7 +538,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
}
rangeStart = fText.getIndex();
}
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
@ -550,14 +550,14 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (reverse) {
fText.setIndex(rangeStart);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
LanguageBreakEngine lbe = null;
while(true) {
while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) {
CharacterIteration.next32(fText);
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
if (current >= rangeEnd) {
break;
@ -577,7 +577,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Reload the loop variables for the next go-round
c = CharacterIteration.current32(fText);
category = (short)fRData.fTrie.getCodePointValue(c);
category = (short)fRData.fTrie.get(c);
}
// If we found breaks, build a new break cache. The first and last entries must
@ -1285,7 +1285,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// caches for quicker access
CharacterIterator text = fText;
CharTrie trie = fRData.fTrie;
Trie2 trie = fRData.fTrie;
// Set up the starting char
int c = text.current();
@ -1338,7 +1338,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
category = (short) trie.getCodePointValue(c);
category = (short) trie.get(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
@ -1504,7 +1504,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// look up the current character's category, which tells us
// which column in the state table to look at.
//
category = (short) fRData.fTrie.getCodePointValue(c);
category = (short) fRData.fTrie.get(c);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:37a4591c3864eb4156671247d3ee06169dd489f65854000badacf0ef7b334a9d
size 12127235
oid sha256:d315546f344483688e78322304130697164e0d0363b20ed00880598630632341
size 12128031

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3c8e833db5b89ace58dde83087599684772a68742408e8a99c9a43b1c9d00c7f
size 92449
oid sha256:17fb194e1234c73ab09442acf76f1b872d77d8aa7494a06f5964f1342616d69e
size 92448