ICU-3064 updated trie java port

X-SVN-Rev: 12880
This commit is contained in:
Syn Wee Quek 2003-08-20 00:20:37 +00:00
parent 95cf50134e
commit 0e57d10a60
5 changed files with 694 additions and 726 deletions

File diff suppressed because it is too large Load diff

View file

@ -5,8 +5,8 @@
******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java,v $
* $Date: 2002/10/31 01:09:18 $
* $Revision: 1.3 $
* $Date: 2003/08/20 00:19:19 $
* $Revision: 1.4 $
*
******************************************************************************
*/
@ -14,6 +14,7 @@
package com.ibm.icu.impl;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import java.util.Arrays;
/**
@ -31,7 +32,7 @@ import java.util.Arrays;
* <LI>Smaller memory footprint.
* </UL>
* This is a direct port from the ICU4C version
* @version $Revision: 1.3 $
* @version $Revision: 1.4 $
* @author Syn Wee Quek
*/
public class IntTrieBuilder extends TrieBuilder
@ -47,6 +48,7 @@ public class IntTrieBuilder extends TrieBuilder
m_data_ = new int[m_dataCapacity_];
System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
m_initialValue_ = table.m_initialValue_;
m_leadUnitValue_ = table.m_leadUnitValue_;
}
/**
@ -58,10 +60,11 @@ public class IntTrieBuilder extends TrieBuilder
* @return updated table
*/
public IntTrieBuilder(int aliasdata[], int maxdatalength,
int initialvalue, boolean latin1linear)
int initialvalue, int leadunitvalue,
boolean latin1linear)
{
super();
if (maxdatalength < DATA_BLOCK_LENGTH_ || (latin1linear
if (maxdatalength < DATA_BLOCK_LENGTH || (latin1linear
&& maxdatalength < 1024)) {
throw new IllegalArgumentException(
"Argument maxdatalength is too small");
@ -75,7 +78,7 @@ public class IntTrieBuilder extends TrieBuilder
}
// preallocate and reset the first data block (block index 0)
int j = DATA_BLOCK_LENGTH_;
int j = DATA_BLOCK_LENGTH;
if (latin1linear) {
// preallocate and reset the first block (number 0) and Latin-1
@ -87,7 +90,7 @@ public class IntTrieBuilder extends TrieBuilder
// do this at least for trie->index[0] even if that block is
// only partly used for Latin-1
m_index_[i ++] = j;
j += DATA_BLOCK_LENGTH_;
j += DATA_BLOCK_LENGTH;
} while (i < (256 >> SHIFT_));
}
@ -95,6 +98,7 @@ public class IntTrieBuilder extends TrieBuilder
// reset the initially allocated blocks to the initial value
Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
m_initialValue_ = initialvalue;
m_leadUnitValue_ = leadunitvalue;
m_dataCapacity_ = maxdatalength;
m_isLatin1Linear_ = latin1linear;
m_isCompacted_ = false;
@ -246,13 +250,135 @@ public class IntTrieBuilder extends TrieBuilder
triedatamanipulate);
}
// public methods -------------------------------------------------
/**
 * Assigns a value to the code points in the half-open range
 * [start..limit[.
 * A code point c with start &lt;= c &lt; limit receives the value if
 * overwrite is true, or if its entry still holds the initial value of
 * this builder.
 * @param start the first code point to get the value
 * @param limit one past the last code point to get the value
 * @param value the value
 * @param overwrite flag for whether old non-initial values are to be
 *        overwritten
 * @return false if a failure occurred (builder already compacted,
 *         illegal argument or data array overrun), true otherwise
 */
public boolean setRange(int start, int limit, int value,
                        boolean overwrite)
{
    // only an uncompacted builder and a well-formed range are accepted
    boolean startValid = start >= UCharacter.MIN_VALUE
                         && start <= UCharacter.MAX_VALUE;
    boolean limitValid = limit >= UCharacter.MIN_VALUE
                         && limit <= (UCharacter.MAX_VALUE + 1);
    if (m_isCompacted_ || !startValid || !limitValid || start > limit) {
        return false;
    }
    if (start == limit) {
        // empty range, nothing to do
        return true;
    }

    // leading partial block: [start..following block boundary[
    int startOffset = start & MASK_;
    if (startOffset != 0) {
        int firstBlock = getDataBlock(start);
        if (firstBlock < 0) {
            return false;
        }
        int nextStart = (start + DATA_BLOCK_LENGTH) & ~MASK_;
        if (nextStart > limit) {
            // the whole range lies inside this single block
            fillBlock(firstBlock, startOffset, limit & MASK_, value,
                      overwrite);
            return true;
        }
        fillBlock(firstBlock, startOffset, DATA_BLOCK_LENGTH, value,
                  overwrite);
        start = nextStart;
    }

    // number of positions in the trailing partial block
    int rest = limit & MASK_;
    // limit rounded down to a block boundary
    int blockLimit = limit & ~MASK_;

    // whole blocks: share one repeat block for all of them where possible;
    // block 0 serves as the repeat block when value is the initial value,
    // -1 means that no repeat block has been created yet
    int repeatBlock = (value == m_initialValue_) ? 0 : -1;
    for (; start < blockLimit; start += DATA_BLOCK_LENGTH) {
        int indexPos = start >> SHIFT_;
        int block = m_index_[indexPos];
        if (block > 0) {
            // already allocated, fill in value
            fillBlock(block, 0, DATA_BLOCK_LENGTH, value, overwrite);
            continue;
        }
        if (m_data_[-block] == value || (block != 0 && !overwrite)) {
            // already holds the value, or is a range block that must not
            // be overwritten
            continue;
        }
        if (repeatBlock < 0) {
            // create the repeat block on first use
            repeatBlock = getDataBlock(start);
            if (repeatBlock < 0) {
                return false;
            }
            // a negative block number in the index marks a repeat block
            m_index_[indexPos] = -repeatBlock;
            fillBlock(repeatBlock, 0, DATA_BLOCK_LENGTH, value, true);
        }
        else {
            // point this index entry at the existing repeat block
            m_index_[indexPos] = -repeatBlock;
        }
    }

    // trailing partial block: [last block boundary..limit[
    if (rest > 0) {
        int lastBlock = getDataBlock(start);
        if (lastBlock < 0) {
            return false;
        }
        fillBlock(lastBlock, 0, rest, value, overwrite);
    }
    return true;
}
// protected data member ------------------------------------------------
protected int m_data_[];
protected int m_initialValue_;
// private data member ------------------------------------------------
private int m_leadUnitValue_;
// private methods ------------------------------------------------------
/**
 * Reserves the next DATA_BLOCK_LENGTH entries after the currently used
 * part of the data array. The entries themselves are not modified here.
 * @return offset of the new block in the data array, or -1 if the block
 *         would not fit within the data array capacity
 */
private int allocDataBlock()
{
    final int blockStart = m_dataLength_;
    final int blockEnd = blockStart + DATA_BLOCK_LENGTH;
    if (blockEnd <= m_dataCapacity_) {
        m_dataLength_ = blockEnd;
        return blockStart;
    }
    // out of memory in the data array
    return -1;
}
/**
* No error checking for illegal arguments.
* @param ch codepoint to look for
@ -267,18 +393,16 @@ public class IntTrieBuilder extends TrieBuilder
}
// allocate a new data block
int newBlock = m_dataLength_;
int newTop = newBlock + DATA_BLOCK_LENGTH_;
if (newTop > m_dataCapacity_) {
int newBlock = allocDataBlock();
if (newBlock < 0) {
// out of memory in the data array
return -1;
}
m_dataLength_ = newTop;
m_index_[ch] = newBlock;
// copy-on-write for a block from a setRange()
Arrays.fill(m_data_, newBlock, newBlock + DATA_BLOCK_LENGTH_,
m_initialValue_);
System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock,
DATA_BLOCK_LENGTH << 2);
return newBlock;
}
@ -307,34 +431,34 @@ public class IntTrieBuilder extends TrieBuilder
// if Latin-1 is preallocated and linear, then do not compact Latin-1
// data
int overlapStart = DATA_BLOCK_LENGTH_;
int overlapStart = DATA_BLOCK_LENGTH;
if (m_isLatin1Linear_ && SHIFT_ <= 8) {
overlapStart += 256;
}
int newStart = DATA_BLOCK_LENGTH_;
int newStart = DATA_BLOCK_LENGTH;
int prevEnd = newStart - 1;
for (int start = newStart; start < m_dataLength_;) {
// start: index of first entry of current block
// prevEnd: index to last entry of previous block
// newStart: index where the current block is to be moved
// skip blocks that are not used
if (m_map_[start >> SHIFT_] < 0) {
if (m_map_[start >>> SHIFT_] < 0) {
// advance start to the next block
start += DATA_BLOCK_LENGTH_;
start += DATA_BLOCK_LENGTH;
// leave prevEnd and newStart with the previous block!
continue;
}
// search for an identical block
if (start >= overlapStart) {
int i = findSameDataBlock(m_data_, newStart, start,
overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH_);
overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH);
if (i >= 0) {
// found an identical block, set the other block's index
// value for the current block
m_map_[start >> SHIFT_] = i;
m_map_[start >>> SHIFT_] = i;
// advance start to the next block
start += DATA_BLOCK_LENGTH_;
start += DATA_BLOCK_LENGTH;
// leave prevEnd and newStart with the previous block!
continue;
}
@ -347,7 +471,7 @@ public class IntTrieBuilder extends TrieBuilder
if (x == m_data_[prevEnd] && overlap && start >= overlapStart)
{
// overlap by at least one
for (i = 1; i < DATA_BLOCK_LENGTH_
for (i = 1; i < DATA_BLOCK_LENGTH
&& x == m_data_[start + i]
&& x == m_data_[prevEnd - i]; ++ i)
{
@ -358,23 +482,23 @@ public class IntTrieBuilder extends TrieBuilder
}
if (i > 0) {
// some overlap
m_map_[start >> SHIFT_] = newStart - i;
m_map_[start >>> SHIFT_] = newStart - i;
// move the non-overlapping indexes to their new positions
start += i;
for (i = DATA_BLOCK_LENGTH_ - i; i > 0; -- i) {
for (i = DATA_BLOCK_LENGTH - i; i > 0; -- i) {
m_data_[newStart ++] = m_data_[start ++];
}
}
else if (newStart < start) {
// no overlap, just move the indexes to their new positions
m_map_[start >> SHIFT_] = newStart;
for (i = DATA_BLOCK_LENGTH_; i > 0; -- i) {
m_map_[start >>> SHIFT_] = newStart;
for (i = DATA_BLOCK_LENGTH; i > 0; -- i) {
m_data_[newStart ++] = m_data_[start ++];
}
}
else { // no overlap && newStart==start
m_map_[start >> SHIFT_] = start;
newStart += DATA_BLOCK_LENGTH_;
m_map_[start >>> SHIFT_] = start;
newStart += DATA_BLOCK_LENGTH;
start = newStart;
}
@ -382,7 +506,7 @@ public class IntTrieBuilder extends TrieBuilder
}
// now adjust the index (stage 1) table
for (int i = 0; i < m_indexLength_; ++ i) {
m_index_[i] = m_map_[m_index_[i] >>> SHIFT_];
m_index_[i] = m_map_[Math.abs(m_index_[i]) >>> SHIFT_];
}
m_dataLength_ = newStart;
}
@ -398,16 +522,16 @@ public class IntTrieBuilder extends TrieBuilder
int otherBlock, int step)
{
// ensure that we do not even partially get past dataLength
dataLength -= DATA_BLOCK_LENGTH_;
dataLength -= DATA_BLOCK_LENGTH;
for (int block = 0; block <= dataLength; block += step) {
int i = 0;
for (i = 0; i < DATA_BLOCK_LENGTH_; ++ i) {
for (i = 0; i < DATA_BLOCK_LENGTH; ++ i) {
if (data[block + i] != data[otherBlock + i]) {
break;
}
}
if (i == DATA_BLOCK_LENGTH_) {
if (i == DATA_BLOCK_LENGTH) {
return block;
}
}
@ -433,16 +557,33 @@ public class IntTrieBuilder extends TrieBuilder
System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
SURROGATE_BLOCK_COUNT_);
// to protect the copied lead surrogate values,
// mark all their indexes as repeat blocks
// (causes copy-on-write)
for (char c = 0xd800; c <= 0xdbff; ++ c) {
int block = index[c >> SHIFT_];
if (block > 0) {
index[c >> SHIFT_] =- block;
// set all values for lead surrogate code *units* to leadUnitValue
// so that by default runtime lookups will find no data for associated
// supplementary code points, unless there is data for such code points
// which will result in a non-zero folding value below that is set for
// the respective lead units
// the above saved the indexes for surrogate code *points*
// fill the indexes with simplified code from utrie_setRange32()
int block = 0;
if (m_leadUnitValue_ == m_initialValue_) {
// leadUnitValue == initialValue, use all-initial-value block
// block = 0; if block here left empty
}
else {
// create and fill the repeatBlock
block = allocDataBlock();
if (block < 0) {
// data table overflow
throw new InternalError("Internal error: Out of memory space");
}
fillBlock(block, 0, DATA_BLOCK_LENGTH, m_leadUnitValue_, true);
// negative block number to indicate that it is a repeat block
block = -block;
}
for (int c = (0xd800 >> SHIFT_); c < (0xdc00 >> SHIFT_); ++ c) {
m_index_[c] = block;
}
// Fold significant index values into the area just after the BMP
// indexes.
// In case the first lead surrogate has significant data,
@ -457,13 +598,16 @@ public class IntTrieBuilder extends TrieBuilder
// there is data, treat the full block for a lead surrogate
c &= ~0x3ff;
// is there an identical index block?
int block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
// get a folded value for [c..c+0x400[ and, if 0, set it for
// the lead surrogate
block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
// get a folded value for [c..c+0x400[ and,
// if different from the value for the lead surrogate code
// point, set it for the lead surrogate code unit
int value = manipulate.getFoldedValue(c,
block + SURROGATE_BLOCK_COUNT_);
if (value != 0) {
if (!setValue(0xd7c0 + (c >> 10), value)) {
if (value != getValue(UTF16.getLeadSurrogate(c))) {
if (!setValue(UTF16.getLeadSurrogate(c), value)) {
// data table overflow
throw new ArrayIndexOutOfBoundsException(
"Data table overflow");
@ -480,7 +624,7 @@ public class IntTrieBuilder extends TrieBuilder
c += 0x400;
}
else {
c += DATA_BLOCK_LENGTH_;
c += DATA_BLOCK_LENGTH;
}
}
@ -505,5 +649,28 @@ public class IntTrieBuilder extends TrieBuilder
indexLength += SURROGATE_BLOCK_COUNT_;
m_indexLength_ = indexLength;
}
/**
 * Writes a value into the entries [start..limit[ of one data block.
 * @param block offset of the block in the data array
 * @param start offset within the block of the first entry to set
 * @param limit offset within the block one past the last entry to set
 * @param value the value to store
 * @param overwrite if true every entry in the range is set, otherwise
 *        only entries that currently hold the initial value are set
 * @internal
 */
private void fillBlock(int block, int start, int limit, int value,
                       boolean overwrite)
{
    final int end = block + limit;
    for (int pos = block + start; pos < end; ++ pos) {
        // without overwrite, entries that differ from the initial value
        // are left untouched
        if (overwrite || m_data_[pos] == m_initialValue_) {
            m_data_[pos] = value;
        }
    }
}
}

View file

@ -5,8 +5,8 @@
******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/TrieBuilder.java,v $
* $Date: 2002/09/06 01:50:43 $
* $Revision: 1.8 $
* $Date: 2003/08/20 00:19:20 $
* $Revision: 1.9 $
*
******************************************************************************
*/
@ -31,7 +31,7 @@ import java.util.Arrays;
* <LI>Smaller memory footprint.
* </UL>
* This is a direct port from the ICU4C version
* @version $Revision: 1.8 $
* @version $Revision: 1.9 $
* @author Syn Wee Quek
*/
public class TrieBuilder
@ -42,7 +42,7 @@ public class TrieBuilder
* Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
* 0x200
*/
public static final int DATA_BLOCK_LENGTH_ = 1 << Trie.INDEX_STAGE_1_SHIFT_;
public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
// public class declaration ----------------------------------------
@ -141,7 +141,7 @@ public class TrieBuilder
protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
/**
* Mask for getting the lower bits from the input index.
* DATA_BLOCK_LENGTH_ - 1.
* DATA_BLOCK_LENGTH - 1.
*/
protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
/**
@ -253,10 +253,10 @@ public class TrieBuilder
/**
* Maximum length of the build-time data (stage 2) array.
* The maximum length is 0x110000 + DATA_BLOCK_LENGTH_ + 0x400.
* The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
* (Number of Unicode code points + one all-initial-value block +
* possible duplicate entries for 1024 lead surrogates.)
*/
private static final int MAX_BUILD_TIME_DATA_LENGTH_ =
0x110000 + DATA_BLOCK_LENGTH_ + 0x400;
0x110000 + DATA_BLOCK_LENGTH + 0x400;
}

View file

@ -5,8 +5,8 @@
******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/TrieIterator.java,v $
* $Date: 2002/11/16 01:49:26 $
* $Revision: 1.8 $
* $Date: 2003/08/20 00:19:19 $
* $Revision: 1.9 $
*
******************************************************************************
*/
@ -131,7 +131,7 @@ public class TrieIterator implements RangeValueIterator
}
if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
calculateNextBMPElement(element)) {
return true;
return true;
}
calculateNextSupplementaryElement(element);
return true;
@ -268,22 +268,26 @@ public class TrieIterator implements RangeValueIterator
m_nextCodepoint_ ++;
m_nextBlockIndex_ ++;
if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
if (UTF16.getTrailSurrogate(m_nextCodepoint_)
!= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
// this piece is only called when we are in the middle of a lead
// surrogate block
if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
// we have cleared one block
m_nextIndex_ ++;
m_nextTrailIndexOffset_ ++;
if (!checkTrailBlock(currentBlock, currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
}
// we have cleared one block
m_nextIndex_ ++;
m_nextTrailIndexOffset_ ++;
if (!checkTrailBlock(currentBlock, currentValue)) {
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_);
// enumerate supplementary code points
while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
@ -293,10 +297,25 @@ public class TrieIterator implements RangeValueIterator
Trie.INDEX_STAGE_2_SHIFT_;
if (leadBlock == m_trie_.m_dataOffset_) {
// no entries for a whole block of lead surrogates
if (currentValue != m_initialValue_) {
m_nextValue_ = m_initialValue_;
m_nextBlock_ = 0;
m_nextBlockIndex_ = 0;
setResult(element, m_currentCodepoint_, m_nextCodepoint_,
currentValue);
m_currentCodepoint_ = m_nextCodepoint_;
return;
}
nextLead += DATA_BLOCK_LENGTH_;
// number of total affected supplementary codepoints in one
// block
m_nextCodepoint_ += DATA_BLOCK_SUPPLEMENTARY_LENGTH_;
// this is not a simple addition of
// DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
// that we might have moved some of the codepoints
m_nextCodepoint_ = UCharacterProperty.getRawSupplementary(
(char)nextLead,
(char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
continue;
}
if (m_trie_.m_dataManipulate_ == null) {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/CollationParsedRuleBuilder.java,v $
* $Date: 2003/07/16 05:52:08 $
* $Revision: 1.22 $
* $Date: 2003/08/20 00:20:37 $
* $Revision: 1.23 $
*
*******************************************************************************
*/
@ -848,7 +848,7 @@ final class CollationParsedRuleBuilder
boolean inBlockZero = m_mapping_.isInZeroBlock(cp);
int tag = getCETag(value);
if (inBlockZero == true) {
cp += TrieBuilder.DATA_BLOCK_LENGTH_;
cp += TrieBuilder.DATA_BLOCK_LENGTH;
}
else if (!(isSpecial(value) && (tag == CE_IMPLICIT_TAG_
|| tag == CE_NOT_FOUND_TAG_))) {
@ -882,10 +882,10 @@ final class CollationParsedRuleBuilder
m_expansions_ = new Vector();
// Do your own mallocs for the structure, array and have linear
// Latin 1
m_mapping_ = new IntTrieBuilder(null, 0x100000,
RuleBasedCollator.CE_SPECIAL_FLAG_
| (CE_NOT_FOUND_TAG_ << 24),
true);
int trieinitialvalue = RuleBasedCollator.CE_SPECIAL_FLAG_
| (CE_NOT_FOUND_TAG_ << 24);
m_mapping_ = new IntTrieBuilder(null, 0x100000, trieinitialvalue,
trieinitialvalue, true);
m_prefixLookup_ = new Hashtable();
// uhash_open(prefixLookupHash, prefixLookupComp);
m_contractions_ = new ContractionTable(m_mapping_);