ICU-8105 new data structure for Unicode property names data (pnames.icu formatVersion 2); includes new dictionary-type tries (only ByteTrie runtime for now, see ticket #8167); merge branches/markus/pnames2 -r 29097:29250

X-SVN-Rev: 29253
This commit is contained in:
Markus Scherer 2010-12-31 18:36:37 +00:00
parent c04082d93c
commit beb1e5718e
4 changed files with 864 additions and 533 deletions

View file

@ -0,0 +1,638 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* created on: 2010nov23
* created by: Markus W. Scherer
* ported from ICU4C bytetrie.h/.cpp
*/
package com.ibm.icu.impl;
import java.io.IOException;
/**
* Light-weight, non-const reader class for a ByteTrie.
* Traverses a byte-serialized data structure with minimal state,
* for mapping byte sequences to non-negative integer values.
*
* @author Markus W. Scherer
*/
public final class ByteTrie implements Cloneable {
    /**
     * Constructs a reader over a byte-serialized trie.
     * The array is not copied; the reader keeps a reference to it.
     *
     * @param trieBytes array that contains the serialized trie
     * @param offset index in trieBytes of the trie's root node
     */
    public ByteTrie(byte[] trieBytes, int offset) {
        bytes_=trieBytes;
        root_=offset;
        pos_=offset;
        remainingMatchLength_=-1;
    }

    /**
     * Clones this trie reader object and its state,
     * but not the byte array which will be shared.
     * @return A shallow clone of this trie.
     */
    @Override
    public Object clone() throws CloneNotSupportedException {
        return super.clone();  // A shallow copy is just what we need.
    }

    /**
     * Resets this trie to its initial state.
     * @return this
     */
    public ByteTrie reset() {
        pos_=root_;
        remainingMatchLength_=-1;
        return this;
    }

    /**
     * ByteTrie state object, for saving a trie's current state
     * and resetting the trie back to this state later.
     */
    public static final class State {
        /** Constructs an empty State; it holds no trie state until passed to saveState(). */
        public State() {}
        // The array of the trie that saved its state here; null until saveState() is called.
        private byte[] bytes;
        private int pos;
        private int remainingMatchLength;
    }

    /**
     * Saves the state of this trie.
     * @param state receives the byte array reference, the position and the pending match length
     * @return this
     * @see #resetToState
     */
    public ByteTrie saveState(State state) /*const*/ {
        state.bytes=bytes_;
        state.pos=pos_;
        state.remainingMatchLength=remainingMatchLength_;
        return this;
    }

    /**
     * Resets this trie to the saved state.
     * @param state a State previously filled by saveState() on a reader over the same byte array
     * @return this
     * @throws IllegalArgumentException if the state object contains no state,
     *         or the state of a different trie
     * @see #saveState
     * @see #reset
     */
    public ByteTrie resetToState(State state) {
        // Identity comparison: the state must come from a reader over the very same array.
        if(bytes_==state.bytes && bytes_!=null) {
            pos_=state.pos;
            remainingMatchLength_=state.remainingMatchLength;
        } else {
            throw new IllegalArgumentException("incompatible trie state");
        }
        return this;
    }

    /**
     * Return values for ByteTrie.next(), UCharTrie.next() and similar methods.
     * The declaration order matters: hasValue() and hasNext() test the ordinal.
     */
    public enum Result {
        /**
         * The input unit(s) did not continue a matching string.
         */
        NO_MATCH,
        /**
         * The input unit(s) continued a matching string
         * but there is no value for the string so far.
         * (It is a prefix of a longer string.)
         */
        NO_VALUE,
        /**
         * The input unit(s) continued a matching string
         * and there is a value for the string so far.
         * This value will be returned by getValue().
         * No further input byte/unit can continue a matching string.
         */
        HAS_FINAL_VALUE,
        /**
         * The input unit(s) continued a matching string
         * and there is a value for the string so far.
         * This value will be returned by getValue().
         * Another input byte/unit can continue a matching string.
         */
        HAS_VALUE;

        /**
         * Same as (result!=NO_MATCH).
         * @return true if the input bytes/units so far are part of a matching string/byte sequence.
         */
        public boolean matches() { return this!=NO_MATCH; }

        /**
         * Equivalent to (result==HAS_VALUE || result==HAS_FINAL_VALUE).
         * @return true if there is a value for the input bytes/units so far.
         * @see #getValue
         */
        public boolean hasValue() { return ordinal()>=2; }

        /**
         * Equivalent to (result==NO_VALUE || result==HAS_VALUE).
         * @return true if another input byte/unit can continue a matching string.
         */
        public boolean hasNext() { return (ordinal()&1)!=0; }
    }

    /**
     * Determines whether the byte sequence so far matches, whether it has a value,
     * and whether another input byte can continue a matching byte sequence.
     * @return The match/value Result.
     */
    public Result current() /*const*/ {
        int pos=pos_;
        if(pos<0) {
            return Result.NO_MATCH;
        } else {
            int node;
            // There is a value only if we are not inside a linear-match node
            // and the current node is a value node.
            return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
                    valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
        }
    }

    /**
     * Traverses the trie from the initial state for this input byte.
     * Equivalent to reset().next(inByte).
     * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff
     *               (so a sign-extended Java byte can be passed directly).
     *               Values below -0x100 and above 0xff will never match.
     * @return The match/value Result.
     */
    public Result first(int inByte) {
        remainingMatchLength_=-1;
        if(inByte<0) {
            inByte+=0x100;
        }
        return nextImpl(root_, inByte);
    }

    /**
     * Traverses the trie from the current state for this input byte.
     * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff
     *               (so a sign-extended Java byte can be passed directly).
     *               Values below -0x100 and above 0xff will never match.
     * @return The match/value Result.
     */
    public Result next(int inByte) {
        int pos=pos_;
        if(pos<0) {
            return Result.NO_MATCH;
        }
        if(inByte<0) {
            inByte+=0x100;
        }
        int length=remainingMatchLength_;  // Actual remaining match length minus 1.
        if(length>=0) {
            // Remaining part of a linear-match node.
            if(inByte==(bytes_[pos++]&0xff)) {
                remainingMatchLength_=--length;
                pos_=pos;
                int node;
                return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
                        valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
            } else {
                stop();
                return Result.NO_MATCH;
            }
        }
        return nextImpl(pos, inByte);
    }

    // The byte-sequence overload below exists only as a C++ signature; it is not
    // implemented in this class. It would traverse the trie from the current state
    // for a whole byte sequence, equivalent to
    //   Result result=current();
    //   for(each c in s)
    //     if((result=next(c))==Result.NO_MATCH) return Result.NO_MATCH;
    //   return result;
    // public Result next(const char *s, int length);

    /**
     * Returns a matching byte sequence's value if called immediately after
     * current()/first()/next() returned Result.HAS_VALUE or Result.HAS_FINAL_VALUE.
     * getValue() can be called multiple times.
     *
     * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
     * @return The value for the byte sequence so far.
     */
    public int getValue() /*const*/ {
        int pos=pos_;
        int leadByte=bytes_[pos++]&0xff;
        assert(leadByte>=kMinValueLead);
        return readValue(bytes_, pos, leadByte>>1);
    }

    /**
     * Determines whether all byte sequences reachable from the current state
     * map to the same value, and if so, returns that value.
     * @return the unique value in bits 32..1 with bit 0 set,
     *         if all byte sequences reachable from the current state
     *         map to the same value; otherwise returns 0.
     */
    public long getUniqueValue() /*const*/ {
        int pos=pos_;
        if(pos<0) {
            return 0;
        }
        // Skip the rest of a pending linear-match node.
        long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0);
        // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
        return (uniqueValue<<31)>>31;
    }

    /**
     * Finds each byte which continues the byte sequence from the current state.
     * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now.
     * @param out Each next byte is 0-extended to a char and appended to this object.
     *            (Only uses the out.append(c) method.)
     * @return the number of bytes which continue the byte sequence from here
     */
    public int getNextBytes(Appendable out) /*const*/ {
        int pos=pos_;
        if(pos<0) {
            return 0;
        }
        if(remainingMatchLength_>=0) {
            append(out, bytes_[pos]&0xff);  // Next byte of a pending linear-match node.
            return 1;
        }
        int node=bytes_[pos++]&0xff;
        if(node>=kMinValueLead) {
            if((node&kValueIsFinal)!=0) {
                return 0;
            } else {
                // Step over the intermediate value to the node that follows it.
                pos=skipValue(pos, node);
                node=bytes_[pos++]&0xff;
                assert(node<kMinValueLead);
            }
        }
        if(node<kMinLinearMatch) {
            if(node==0) {
                // Long branch: the length minus 1 is stored in the next byte.
                node=bytes_[pos++]&0xff;
            }
            getNextBranchBytes(bytes_, pos, ++node, out);
            return node;
        } else {
            // First byte of the linear-match node.
            append(out, bytes_[pos]&0xff);
            return 1;
        }
    }

    // Marks the reader as stopped: current() and next() return NO_MATCH
    // until reset(), first() or resetToState().
    private void stop() {
        pos_=-1;
    }

    // Reads a compact 32-bit integer.
    // pos is already after the leadByte, and the lead byte is already shifted right by 1.
    private static int readValue(byte[] bytes, int pos, int leadByte) {
        int value;
        if(leadByte<kMinTwoByteValueLead) {
            value=leadByte-kMinOneByteValueLead;
        } else if(leadByte<kMinThreeByteValueLead) {
            value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff);
        } else if(leadByte<kFourByteValueLead) {
            value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
        } else if(leadByte==kFourByteValueLead) {
            value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
        } else {
            // Five-byte form: the top byte is deliberately not masked so that it supplies the sign.
            value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
        }
        return value;
    }

    // Returns the position after a compact value.
    // pos is already after the lead byte; leadByte is the unshifted lead byte.
    private static int skipValue(int pos, int leadByte) {
        assert(leadByte>=kMinValueLead);
        if(leadByte>=(kMinTwoByteValueLead<<1)) {
            if(leadByte<(kMinThreeByteValueLead<<1)) {
                ++pos;
            } else if(leadByte<(kFourByteValueLead<<1)) {
                pos+=2;
            } else {
                // 3 trailing bytes for kFourByteValueLead, 4 for kFiveByteValueLead.
                pos+=3+((leadByte>>1)&1);
            }
        }
        return pos;
    }

    // Returns the position after the compact value whose lead byte is at pos.
    private static int skipValue(byte[] bytes, int pos) {
        int leadByte=bytes[pos++]&0xff;
        return skipValue(pos, leadByte);
    }

    // Reads a jump delta and jumps.
    // The delta is relative to the position just after the delta's own bytes.
    private static int jumpByDelta(byte[] bytes, int pos) {
        int delta=bytes[pos++]&0xff;
        if(delta<kMinTwoByteDeltaLead) {
            // nothing to do
        } else if(delta<kMinThreeByteDeltaLead) {
            delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff);
        } else if(delta<kFourByteDeltaLead) {
            delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff);
            pos+=2;
        } else if(delta==kFourByteDeltaLead) {
            delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff);
            pos+=3;
        } else {
            delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff);
            pos+=4;
        }
        return pos+delta;
    }

    // Returns the position after the compact jump delta whose lead byte is at pos.
    private static int skipDelta(byte[] bytes, int pos) {
        int delta=bytes[pos++]&0xff;
        if(delta>=kMinTwoByteDeltaLead) {
            if(delta<kMinThreeByteDeltaLead) {
                ++pos;
            } else if(delta<kFourByteDeltaLead) {
                pos+=2;
            } else {
                // 3 trailing bytes for kFourByteDeltaLead (0xfe), 4 for kFiveByteDeltaLead (0xff).
                pos+=3+(delta&1);
            }
        }
        return pos;
    }

    // Indexed by a value node's "final" bit (node&kValueIsFinal).
    private static final Result[] valueResults_={ Result.HAS_VALUE, Result.HAS_FINAL_VALUE };

    // Handles a branch node for both next(byte) and next(string).
    // pos is after the branch node's lead byte; length is that lead byte.
    private Result branchNext(int pos, int length, int inByte) {
        // Branch according to the current byte.
        if(length==0) {
            length=bytes_[pos++]&0xff;
        }
        ++length;
        // The length of the branch is the number of bytes to select from.
        // The data structure encodes a binary search.
        while(length>kMaxBranchLinearSubNodeLength) {
            if(inByte<(bytes_[pos++]&0xff)) {
                length>>=1;
                pos=jumpByDelta(bytes_, pos);
            } else {
                length=length-(length>>1);
                pos=skipDelta(bytes_, pos);
            }
        }
        // Drop down to linear search for the last few bytes.
        // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
        // and divides length by 2.
        do {
            if(inByte==(bytes_[pos++]&0xff)) {
                Result result;
                int node=bytes_[pos]&0xff;
                assert(node>=kMinValueLead);
                if((node&kValueIsFinal)!=0) {
                    // Leave the final value for getValue() to read.
                    result=Result.HAS_FINAL_VALUE;
                } else {
                    // Use the non-final value as the jump delta.
                    ++pos;
                    // int delta=readValue(pos, node>>1);
                    // Inlined because pos must also be advanced past the value bytes.
                    node>>=1;
                    int delta;
                    if(node<kMinTwoByteValueLead) {
                        delta=node-kMinOneByteValueLead;
                    } else if(node<kMinThreeByteValueLead) {
                        delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff);
                    } else if(node<kFourByteValueLead) {
                        delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff);
                        pos+=2;
                    } else if(node==kFourByteValueLead) {
                        delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff);
                        pos+=3;
                    } else {
                        delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff);
                        pos+=4;
                    }
                    // end readValue()
                    pos+=delta;
                    node=bytes_[pos]&0xff;
                    result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
                }
                pos_=pos;
                return result;
            }
            --length;
            pos=skipValue(bytes_, pos);
        } while(length>1);
        // The last comparison byte has no value of its own; the next node follows it directly.
        if(inByte==(bytes_[pos++]&0xff)) {
            pos_=pos;
            int node=bytes_[pos]&0xff;
            return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
        } else {
            stop();
            return Result.NO_MATCH;
        }
    }

    // Matches inByte against the node at pos.
    // Requires remainingMatchLength_<0.
    private Result nextImpl(int pos, int inByte) {
        for(;;) {
            int node=bytes_[pos++]&0xff;
            if(node<kMinLinearMatch) {
                return branchNext(pos, node, inByte);
            } else if(node<kMinValueLead) {
                // Match the first of length+1 bytes.
                int length=node-kMinLinearMatch;  // Actual match length minus 1.
                if(inByte==(bytes_[pos++]&0xff)) {
                    remainingMatchLength_=--length;
                    pos_=pos;
                    return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ?
                            valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
                } else {
                    // No match.
                    break;
                }
            } else if((node&kValueIsFinal)!=0) {
                // No further matching bytes.
                break;
            } else {
                // Skip intermediate value.
                pos=skipValue(pos, node);
                // The next node must not also be a value node.
                assert((bytes_[pos]&0xff)<kMinValueLead);
            }
        }
        stop();
        return Result.NO_MATCH;
    }

    // Helper functions for getUniqueValue().
    // Recursively finds a unique value (or whether there is not a unique one)
    // from a branch.
    // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
    // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
    private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length,
                                                  long uniqueValue) {
        while(length>kMaxBranchLinearSubNodeLength) {
            ++pos;  // ignore the comparison byte
            uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue);
            if(uniqueValue==0) {
                return 0;
            }
            length=length-(length>>1);
            pos=skipDelta(bytes, pos);
        }
        do {
            ++pos;  // ignore a comparison byte
            // handle its value
            int node=bytes[pos++]&0xff;
            boolean isFinal=(node&kValueIsFinal)!=0;
            int value=readValue(bytes, pos, node>>1);
            pos=skipValue(pos, node);
            if(isFinal) {
                if(uniqueValue!=0) {
                    if(value!=(int)(uniqueValue>>1)) {
                        return 0;
                    }
                } else {
                    uniqueValue=((long)value<<1)|1;
                }
            } else {
                // A non-final value is a jump delta to the sub-trie for this byte.
                uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue);
                if(uniqueValue==0) {
                    return 0;
                }
            }
        } while(--length>1);
        // ignore the last comparison byte
        return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
    }

    // Recursively finds a unique value (or whether there is not a unique one)
    // starting from a position on a node lead byte.
    // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
    // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
    private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) {
        for(;;) {
            int node=bytes[pos++]&0xff;
            if(node<kMinLinearMatch) {
                if(node==0) {
                    node=bytes[pos++]&0xff;
                }
                uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue);
                if(uniqueValue==0) {
                    return 0;
                }
                // Continue with the node after the branch's last comparison byte.
                pos=(int)(uniqueValue>>>33);
            } else if(node<kMinValueLead) {
                // linear-match node
                pos+=node-kMinLinearMatch+1;  // Ignore the match bytes.
            } else {
                boolean isFinal=(node&kValueIsFinal)!=0;
                int value=readValue(bytes, pos, node>>1);
                if(uniqueValue!=0) {
                    if(value!=(int)(uniqueValue>>1)) {
                        return 0;
                    }
                } else {
                    uniqueValue=((long)value<<1)|1;
                }
                if(isFinal) {
                    return uniqueValue;
                }
                pos=skipValue(pos, node);
            }
        }
    }

    // Helper functions for getNextBytes().
    // getNextBytes() when pos is on a branch node.
    private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) {
        while(length>kMaxBranchLinearSubNodeLength) {
            ++pos;  // ignore the comparison byte
            getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out);
            length=length-(length>>1);
            pos=skipDelta(bytes, pos);
        }
        do {
            append(out, bytes[pos++]&0xff);
            pos=skipValue(bytes, pos);
        } while(--length>1);
        append(out, bytes[pos]&0xff);
    }

    // Appends c as a char; an IOException from the Appendable is rethrown unchecked.
    private static void append(Appendable out, int c) {
        try {
            out.append((char)c);
        } catch(IOException e) {
            throw new RuntimeException(e);
        }
    }

    // ByteTrie data structure
    //
    // The trie consists of a series of byte-serialized nodes for incremental
    // string/byte sequence matching. The root node is at the beginning of the trie data.
    //
    // Types of nodes are distinguished by their node lead byte ranges.
    // After each node, except a final-value node, another node follows to
    // encode match values or continue matching further bytes.
    //
    // Node types:
    //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
    //    The value is for the string/byte sequence so far.
    //    One node bit indicates whether the value is final or whether
    //    matching continues with the next node.
    //  - Linear-match node: Matches a number of bytes.
    //  - Branch node: Branches to other nodes according to the current input byte.
    //    The node byte is the length of the branch (number of bytes to select from)
    //    minus 1. It is followed by a sub-node:
    //    - If the length is at most kMaxBranchLinearSubNodeLength, then
    //      there are length-1 (key, value) pairs and then one more comparison byte.
    //      If one of the key bytes matches, then the value is either a final value for
    //      the string/byte sequence so far, or a "jump" delta to the next node.
    //      If the last byte matches, then matching continues with the next node.
    //      (Values have the same encoding as value nodes.)
    //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
    //      there is one byte and one "jump" delta.
    //      If the input byte is less than the sub-node byte, then "jump" by delta to
    //      the next sub-node which will have a length of length/2.
    //      (The delta has its own compact encoding.)
    //      Otherwise, skip the "jump" delta to the next sub-node
    //      which will have a length of length-length/2.

    // Node lead byte values.

    // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
    // the length is one more than the next byte.

    // For a branch sub-node with at most this many entries, we drop down
    // to a linear search.
    private static final int kMaxBranchLinearSubNodeLength=5;

    // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
    private static final int kMinLinearMatch=0x10;
    private static final int kMaxLinearMatchLength=0x10;

    // 20..ff: Variable-length value node.
    // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
    // Then shift-right by 1 bit.
    // The remaining lead byte value indicates the number of following bytes (0..4)
    // and contains the value's top bits.
    private static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
    // It is a final value if bit 0 is set.
    private static final int kValueIsFinal=1;

    // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
    private static final int kMinOneByteValueLead=kMinValueLead/2;  // 0x10
    private static final int kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.

    private static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
    private static final int kMaxTwoByteValue=0x1aff;

    private static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
    private static final int kFourByteValueLead=0x7e;

    // A little more than Unicode code points. (0x11ffff)
    /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;

    /*package*/ static final int kFiveByteValueLead=0x7f;

    // Compact delta integers.
    private static final int kMaxOneByteDelta=0xbf;
    private static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
    private static final int kMinThreeByteDeltaLead=0xf0;
    private static final int kFourByteDeltaLead=0xfe;
    /*package*/ static final int kFiveByteDeltaLead=0xff;

    /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
    /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff

    // Fixed value referencing the ByteTrie bytes.
    private final byte[] bytes_;
    // Index of the root node in bytes_.
    private final int root_;

    // Iterator variables.

    // Index of next trie byte to read. Negative if no more matches.
    private int pos_;
    // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
    private int remainingMatchLength_;
}

View file

@ -6,6 +6,7 @@
* Author: Alan Liu
* Created: November 5 2002
* Since: ICU 2.4
* 2010nov19 Markus Scherer Rewrite for formatVersion 2.
**********************************************************************
*/
@ -17,7 +18,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.MissingResourceException;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
/**
@ -25,13 +25,12 @@ import com.ibm.icu.lang.UProperty;
* imported from icu4c. It contains property and property value
* aliases from the UCD files PropertyAliases.txt and
* PropertyValueAliases.txt. The file is built by the icu4c tool
* genpname. It must be built on an ASCII big-endian platform to be
* genpname. It must be an ASCII big-endian file to be
* usable in icu4j.
*
* This class performs two functions.
*
* (1) It can import the flat binary data into a tree of usable
* objects.
* (1) It can import the flat binary data into usable objects.
*
* (2) It provides an API to access the tree of objects.
*
@ -39,170 +38,186 @@ import com.ibm.icu.lang.UProperty;
* of icu4c's pnames.icu file.
*
* Each time a UPropertyAliases is constructed, the pnames.icu file is
* read, parsed, and a data tree assembled. Clients should create one
* read, parsed, and data structures assembled. Clients should create one
* singleton instance and cache it.
*
* @author Alan Liu
* @since ICU 2.4
*/
public final class UPropertyAliases implements ICUBinary.Authenticate {
public final class UPropertyAliases {
// Byte offsets from the start of the data, after the generic header.
private static final int IX_VALUE_MAPS_OFFSET=0;
private static final int IX_BYTE_TRIES_OFFSET=1;
private static final int IX_NAME_GROUPS_OFFSET=2;
private static final int IX_RESERVED3_OFFSET=3;
// private static final int IX_RESERVED4_OFFSET=4;
// private static final int IX_TOTAL_SIZE=5;
// Other values.
// private static final int IX_MAX_NAME_LENGTH=6;
// private static final int IX_RESERVED7=7;
// private static final int IX_COUNT=8;
//----------------------------------------------------------------
// Runtime data. This is an unflattened representation of the
// data in pnames.icu.
/**
* Map from property enum value to nameGroupPool[] index
*/
private NonContiguousEnumToShort enumToName;
private int[] valueMaps;
private byte[] byteTries;
private String nameGroups;
/**
* Map from property alias to property enum value
*/
private NameToEnum nameToEnum;
/**
 * Version check handed to ICUBinary.readHeader() when loading pnames.icu:
 * accepts the data only when the first byte of the version array is 2.
 */
private static final class IsAcceptable implements ICUBinary.Authenticate {
    // @Override when we switch to Java 6
    public boolean isDataVersionAcceptable(byte[] version) {
        // NOTE(review): presumably version[0] is the major format version - confirm against ICUBinary.
        final int majorVersion=version[0];
        return 2==majorVersion;
    }
}
private static final IsAcceptable IS_ACCEPTABLE=new IsAcceptable();
private static final byte DATA_FORMAT[]={ 0x70, 0x6E, 0x61, 0x6D }; // "pnam"
/**
* Map from property enum value to valueMapArray[] index
*/
private NonContiguousEnumToShort enumToValue;
// Reads pnames.icu from the given stream into valueMaps, byteTries and nameGroups.
// Checks the header via ICUBinary.readHeader(), then reads the indexes array and the
// three data blocks in file order; each block's length is the difference between
// two consecutive offsets from the indexes.
// Throws IOException if fewer than 8 indexes are present or if reading fails.
// NOTE(review): data.close() is only reached on success; if any read throws,
// the stream is left open. Consider try/finally.
// NOTE(review): the "private ..." field declarations and Javadoc fragments interleaved
// below are not legal inside a method body; they look like removed lines left over
// from the diff rendering rather than part of this method.
private void load(InputStream data) throws IOException {
BufferedInputStream bis=new BufferedInputStream(data);
//dataVersion=ICUBinary.readHeaderAndDataVersion(bis, DATA_FORMAT, IS_ACCEPTABLE);
ICUBinary.readHeader(bis, DATA_FORMAT, IS_ACCEPTABLE);
DataInputStream ds=new DataInputStream(bis);
// The first int is the byte offset of the valueMaps, which directly follow the indexes,
// so dividing it by 4 gives the number of int indexes.
int indexesLength=ds.readInt()/4; // inIndexes[IX_VALUE_MAPS_OFFSET]/4
if(indexesLength<8) { // formatVersion 2 initially has 8 indexes
throw new IOException("pnames.icu: not enough indexes");
}
int[] inIndexes=new int[indexesLength];
// Index 0 was already consumed above; restore its value instead of reading it again.
inIndexes[0]=indexesLength*4;
for(int i=1; i<indexesLength; ++i) {
inIndexes[i]=ds.readInt();
}
/**
* Each entry represents a binary or enumerated property
*/
private ValueMap valueMapArray[];
// Read the valueMaps.
int offset=inIndexes[IX_VALUE_MAPS_OFFSET];
int nextOffset=inIndexes[IX_BYTE_TRIES_OFFSET];
// Offsets are in bytes; the valueMaps are stored as 32-bit ints.
int numInts=(nextOffset-offset)/4;
valueMaps=new int[numInts];
for(int i=0; i<numInts; ++i) {
valueMaps[i]=ds.readInt();
}
/**
* Pool of concatenated integer runs. Each run contains one
* or more entries. The last entry of the run is negative.
* A zero entry indicates "n/a" in the Property*Aliases.txt.
* Each entry is a stringPool[] index.
*/
private short nameGroupPool[];
// Read the byteTries.
offset=nextOffset;
nextOffset=inIndexes[IX_NAME_GROUPS_OFFSET];
int numBytes=nextOffset-offset;
byteTries=new byte[numBytes];
ds.readFully(byteTries);
/**
* Pool of strings.
*/
private String stringPool[];
// Read the nameGroups and turn them from ASCII bytes into a Java String.
offset=nextOffset;
nextOffset=inIndexes[IX_RESERVED3_OFFSET];
numBytes=nextOffset-offset;
StringBuilder sb=new StringBuilder(numBytes);
for(int i=0; i<numBytes; ++i) {
// Each byte becomes one char via a plain cast (no charset decoding).
sb.append((char)ds.readByte());
}
nameGroups=sb.toString();
//----------------------------------------------------------------
// Constants
data.close();
}
/**
* Debug flag (not really constant)
*/
private static boolean DEBUG = ICUDebug.enabled("pnames");
/**
* File format that this class understands.
* See icu4c/src/common/propname.h.
*/
private static final byte DATA_FORMAT_ID[] = {'p', 'n', 'a', 'm'};
/**
* File version that this class understands.
* See icu4c/src/common/propname.h.
*/
private static final byte DATA_FORMAT_VERSION = 1;
/**
* Name of the datafile
*/
private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/pnames.icu";
/**
* Buffer size of datafile. The whole file is < 16k.
*/
private static final int DATA_BUFFER_SIZE = 8192;
//----------------------------------------------------------------
// Constructor
/**
* Constructs a UPropertyAliases object. The binary file
* DATA_FILE_NAME is read from the jar/classpath and unflattened
* into member variables of this object.
*/
private UPropertyAliases() throws IOException {
load(ICUData.getRequiredStream(ICUResourceBundle.ICU_BUNDLE+"/pnames.icu"));
}
// Open the .icu file from the jar/classpath
InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME);
BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE);
// Read and discard Unicode version...
/* byte unicodeVersion[] = */ICUBinary.readHeader(b, DATA_FORMAT_ID, this);
DataInputStream d = new DataInputStream(b);
// Record the origin position of the file. Keep enough around
// to seek back to the start of the header.
d.mark(256);
short enumToName_offset = d.readShort();
short nameToEnum_offset = d.readShort();
short enumToValue_offset = d.readShort();
short total_size = d.readShort();
short valueMap_offset = d.readShort();
short valueMap_count = d.readShort();
short nameGroupPool_offset = d.readShort();
short nameGroupPool_count = d.readShort();
short stringPool_offset = d.readShort();
short stringPool_count = d.readShort();
if (DEBUG) {
System.out.println(
"enumToName_offset=" + enumToName_offset + "\n" +
"nameToEnum_offset=" + nameToEnum_offset + "\n" +
"enumToValue_offset=" + enumToValue_offset + "\n" +
"total_size=" + total_size + "\n" +
"valueMap_offset=" + valueMap_offset + "\n" +
"valueMap_count=" + valueMap_count + "\n" +
"nameGroupPool_offset=" + nameGroupPool_offset + "\n" +
"nameGroupPool_count=" + nameGroupPool_count + "\n" +
"stringPool_offset=" + stringPool_offset + "\n" +
"stringPool_count=" + stringPool_count);
/**
 * Looks up a property enum in the range table at the start of valueMaps.
 * valueMaps[0] is the number of ranges; each range is a (start, limit) pair
 * followed by two ints per property in [start, limit).
 *
 * @param property the property enum to look for
 * @return the valueMaps index of the property's two-int entry,
 *         or 0 if the property is not covered by any range
 */
private int findProperty(int property) {
    int cursor=1;  // just past the range count
    int rangesLeft=valueMaps[0];
    while(rangesLeft>0) {
        final int rangeStart=valueMaps[cursor];
        final int rangeLimit=valueMaps[cursor+1];
        cursor+=2;  // now on the first entry of this range
        if(property<rangeStart) {
            // The search gives up at the first range that starts above the property.
            return 0;
        }
        if(property<rangeLimit) {
            return cursor+2*(property-rangeStart);
        }
        // Not in this range: step over all of its two-int entries.
        cursor+=2*(rangeLimit-rangeStart);
        --rangesLeft;
    }
    return 0;
}
// Read it all (less than 32k). Seeking around (using
// mark/reset/skipBytes) doesn't work directly on the file,
// but it works fine if we read everything into a byte[] array
// first.
byte raw[] = new byte[total_size];
d.reset();
d.readFully(raw);
d.close();
Builder builder = new Builder(raw);
stringPool = builder.readStringPool(stringPool_offset,
stringPool_count);
nameGroupPool = builder.readNameGroupPool(nameGroupPool_offset,
nameGroupPool_count);
builder.setupValueMap_map(valueMap_offset, valueMap_count);
// Some of the following data structures have to be set up
// here, _not_ in Builder. That's because they are instances
// of non-static inner classes, and they contain implicit
// references to this.
builder.seek(enumToName_offset);
enumToName = new NonContiguousEnumToShort(builder);
builder.nameGroupOffsetToIndex(enumToName.offsetArray);
builder.seek(nameToEnum_offset);
nameToEnum = new NameToEnum(builder);
builder.seek(enumToValue_offset);
enumToValue = new NonContiguousEnumToShort(builder);
builder.valueMapOffsetToIndex(enumToValue.offsetArray);
valueMapArray = new ValueMap[valueMap_count];
for (int i=0; i<valueMap_count; ++i) {
// Must seek to the start of each entry.
builder.seek(builder.valueMap_map[i]);
valueMapArray[i] = new ValueMap(builder);
/**
 * Finds the name group offset for one value of a property.
 * The value map starts with the property's ByteTrie offset, followed by a header int:
 * below 0x10 it is a count of (start, limit) ranges, each followed by one
 * name group offset per value; otherwise (header-0x10) sorted values are listed,
 * followed by the same number of name group offsets.
 *
 * @param valueMapIndex valueMaps index of the property's value map, or 0 if it has none
 * @param value the property value enum to look for
 * @return the value stored for this property value (used by callers as a nameGroups offset),
 *         or 0 if there is no value map or the value is not found
 */
private int findPropertyValueNameGroup(int valueMapIndex, int value) {
    if(valueMapIndex==0) {
        return 0;  // The property does not have named values.
    }
    int cursor=valueMapIndex+1;  // Skip the ByteTrie offset.
    final int header=valueMaps[cursor];
    ++cursor;
    if(header>=0x10) {
        // List of values, then a parallel list of name group offsets.
        final int valuesStart=cursor;
        final int offsetsStart=valuesStart+header-0x10;
        int k=valuesStart;
        do {
            final int candidate=valueMaps[k];
            if(candidate==value) {
                return valueMaps[offsetsStart+k-valuesStart];
            }
            if(value<candidate) {
                return 0;  // The search stops at the first larger value.
            }
            ++k;
        } while(k<offsetsStart);
        return 0;
    }
    // Ranges of values.
    for(int rangesLeft=header; rangesLeft>0; --rangesLeft) {
        final int start=valueMaps[cursor];
        final int limit=valueMaps[cursor+1];
        cursor+=2;
        if(value<start) {
            return 0;
        }
        if(value<limit) {
            return valueMaps[cursor+value-start];
        }
        cursor+=limit-start;  // Skip all entries for this range.
    }
    return 0;
}
builder.close();
/**
 * Extracts one name from a name group in the nameGroups string.
 * A group starts with a char holding the number of names,
 * followed by that many NUL-terminated names.
 *
 * @param nameGroupsIndex index of the group's count char in nameGroups
 * @param nameIndex which name of the group to return (0-based)
 * @return the selected name, or null if that name is empty ("n/a" in the source data)
 * @throws IllegalIcuArgumentException if nameIndex is negative or not less than the name count
 */
private String getName(int nameGroupsIndex, int nameIndex) {
    final int nameCount=nameGroups.charAt(nameGroupsIndex);
    if(!(0<=nameIndex && nameIndex<nameCount)) {
        throw new IllegalIcuArgumentException("Invalid property (value) name choice");
    }
    int begin=nameGroupsIndex+1;
    // Step over the names that precede the requested one.
    for(int skipped=0; skipped<nameIndex; ++skipped) {
        while(nameGroups.charAt(begin)!=0) {
            ++begin;
        }
        ++begin;  // step over the terminating NUL
    }
    // Scan to the NUL that terminates the requested name.
    int end=begin;
    while(nameGroups.charAt(end)!=0) {
        ++end;
    }
    // An empty name means no name (Property[Value]Aliases.txt has "n/a").
    return end==begin ? null : nameGroups.substring(begin, end);
}
/**
 * Maps ASCII 'A'..'Z' to 'a'..'z'; every other code is returned unchanged.
 */
private static int asciiToLowercase(int c) {
    if(c<'A' || 'Z'<c) {
        return c;
    }
    return c+('a'-'A');
}
/**
 * Feeds a name into the trie with loose matching: '-', '_', space and
 * the controls 0x09..0x0d are skipped, and ASCII letters are lowercased.
 * The trie is advanced from its current state and is left positioned
 * after the last char that was fed.
 *
 * @param trie the trie reader to advance
 * @param name the property (value) name to look up
 * @return true if the chars that were fed end on a trie entry that has a value
 */
private boolean containsName(ByteTrie trie, CharSequence name) {
    ByteTrie.Result state=ByteTrie.Result.NO_VALUE;
    int index=0;
    while(index<name.length()) {
        final int c=name.charAt(index++);
        // Ignore delimiters '-', '_', and ASCII White_Space.
        final boolean isDelimiter= c=='-' || c=='_' || c==' ' || (0x09<=c && c<=0x0d);
        if(isDelimiter) {
            continue;
        }
        // A significant char remains but the trie cannot continue from here.
        if(!state.hasNext()) {
            return false;
        }
        state=trie.next(asciiToLowercase(c));
    }
    return state.hasValue();
}
//----------------------------------------------------------------
@ -215,194 +230,86 @@ public final class UPropertyAliases implements ICUBinary.Authenticate {
INSTANCE = new UPropertyAliases();
} catch(IOException e) {
///CLOVER:OFF
throw new MissingResourceException("Could not construct UPropertyAliases. Missing pnames.icu","","");
MissingResourceException mre = new MissingResourceException(
"Could not construct UPropertyAliases. Missing pnames.icu", "", "");
mre.initCause(e);
throw mre;
///CLOVER:ON
}
}
/**
* Return a property name given a property enum. Multiple
* names may be available for each property; the nameChoice
* selects among them.
* Returns a property name given a property enum.
* Multiple names may be available for each property;
* the nameChoice selects among them.
*/
public String getPropertyName(int property,
int nameChoice) {
short nameGroupIndex = enumToName.getShort(property);
return chooseNameInGroup(nameGroupIndex, nameChoice);
public String getPropertyName(int property, int nameChoice) {
    final int valueMapIndex=findProperty(property);
    if(valueMapIndex!=0) {
        // The first int of the property's entry is the offset of its name group.
        final int nameGroupOffset=valueMaps[valueMapIndex];
        return getName(nameGroupOffset, nameChoice);
    }
    // findProperty() returns 0 for a property enum that is not in the data.
    throw new IllegalArgumentException(
            "Invalid property enum "+property+" (0x"+Integer.toHexString(property)+")");
}
/**
* Return a property enum given one of its property names.
* Returns a value name given a property enum and a value enum.
* Multiple names may be available for each value;
* the nameChoice selects among them.
*/
public String getPropertyValueName(int property, int value, int nameChoice) {
    final int valueMapIndex=findProperty(property);
    if(valueMapIndex==0) {
        throw new IllegalArgumentException(
                "Invalid property enum "+property+" (0x"+Integer.toHexString(property)+")");
    }
    // The second int of the property's entry is the index of its value map.
    final int valueMapOffset=valueMaps[valueMapIndex+1];
    final int nameGroupOffset=findPropertyValueNameGroup(valueMapOffset, value);
    if(nameGroupOffset!=0) {
        return getName(nameGroupOffset, nameChoice);
    }
    // Reached both when the property has no value map and when this value is not in it.
    throw new IllegalArgumentException(
            "Property "+property+" (0x"+Integer.toHexString(property)+
            ") does not have named values");
}
/**
 * Looks up an alias in the ByteTrie that starts at the given offset of byteTries.
 * @param byteTrieOffset start offset of the trie inside byteTries
 * @param alias the property or value name to look up
 * @return the value the trie holds after containsName() matched the alias,
 *         or UProperty.UNDEFINED if containsName() reports no match
 */
private int getPropertyOrValueEnum(int byteTrieOffset, CharSequence alias) {
    ByteTrie trie=new ByteTrie(byteTries, byteTrieOffset);
    if(!containsName(trie, alias)) {
        return UProperty.UNDEFINED;
    }
    // containsName() has walked the trie, so its current value belongs to the alias.
    return trie.getValue();
}
/**
* Returns a property enum given one of its property names.
* If the property name is not known, this method returns
* UProperty.UNDEFINED.
*/
public int getPropertyEnum(String propertyAlias) {
return nameToEnum.getEnum(propertyAlias);
public int getPropertyEnum(CharSequence alias) {
// 0 is passed as the byteTrieOffset argument of getPropertyOrValueEnum().
// NOTE(review): presumably the trie of property names is stored first in
// byteTries -- confirm against the code that loads the data.
return getPropertyOrValueEnum(0, alias);
}
/**
* Return a value name given a property enum and a value enum.
* Multiple names may be available for each value; the nameChoice
* selects among them.
* Returns a value enum given a property enum and one of its value names.
*/
public String getPropertyValueName(int property,
int value,
int nameChoice) {
ValueMap vm = getValueMap(property);
short nameGroupIndex = vm.enumToName.getShort(value);
return chooseNameInGroup(nameGroupIndex, nameChoice);
}
/**
* Return a value enum given one of its value names and the
* corresponding property alias.
*/
public int getPropertyValueEnum(int property,
String valueAlias) {
ValueMap vm = getValueMap(property);
return vm.nameToEnum.getEnum(valueAlias);
}
//----------------------------------------------------------------
// Data structures
/**
* A map for the legal values of a binary or enumerated property.
*/
private class ValueMap {
/**
* Maps value enum to index into the nameGroupPool[]
*/
EnumToShort enumToName; // polymorphic
/**
* Maps value name to value enum.
*/
NameToEnum nameToEnum;
ValueMap(Builder b) throws IOException {
short enumToName_offset = b.readShort();
short ncEnumToName_offset = b.readShort();
short nameToEnum_offset = b.readShort();
if (enumToName_offset != 0) {
b.seek(enumToName_offset);
ContiguousEnumToShort x = new ContiguousEnumToShort(b);
b.nameGroupOffsetToIndex(x.offsetArray);
enumToName = x;
} else {
b.seek(ncEnumToName_offset);
NonContiguousEnumToShort x = new NonContiguousEnumToShort(b);
b.nameGroupOffsetToIndex(x.offsetArray);
enumToName = x;
}
b.seek(nameToEnum_offset);
nameToEnum = new NameToEnum(b);
public int getPropertyValueEnum(int property, CharSequence alias) {
    // A result of 0 from findProperty() is treated as "no such property".
    final int propertyEntry=findProperty(property);
    if(propertyEntry==0) {
        throw new IllegalArgumentException(
                "Invalid property enum "+property+" (0x"+Integer.toHexString(property)+")");
    }
    // The word after the property's entry is the start of its valueMap;
    // 0 there is reported as a property without named values.
    final int valueMapStart=valueMaps[propertyEntry+1];
    if(valueMapStart==0) {
        throw new IllegalArgumentException(
                "Property "+property+" (0x"+Integer.toHexString(property)+
                ") does not have named values");
    }
    // The first word of the property's valueMap is the ByteTrie offset.
    final int byteTrieOffset=valueMaps[valueMapStart];
    return getPropertyOrValueEnum(byteTrieOffset, alias);
}
/**
* Abstract map from enum values to integers.
*/
private interface EnumToShort {
// Returns the short value mapped to enumProbe. The implementations in this
// file throw IllegalIcuArgumentException when enumProbe has no mapping.
short getShort(int enumProbe);
}
/**
* Generic map from enum values to offsets. Enum values are
* contiguous.
*/
private static class ContiguousEnumToShort implements EnumToShort {
    /** First enum value covered by this map (inclusive). */
    int enumStart;
    /** End of the covered enum range (exclusive). */
    int enumLimit;
    /** One short per enum value, indexed by (enum - enumStart). */
    short offsetArray[];

    /**
     * Reads enumStart and enumLimit as ints, followed by
     * (enumLimit - enumStart) shorts, from the stream.
     */
    ContiguousEnumToShort(ICUBinaryStream s) throws IOException {
        enumStart = s.readInt();
        enumLimit = s.readInt();
        offsetArray = new short[enumLimit - enumStart];
        for (int slot=0; slot<offsetArray.length; ++slot) {
            offsetArray[slot] = s.readShort();
        }
    }

    /**
     * Returns the short stored for enumProbe.
     * @throws IllegalIcuArgumentException if enumProbe is outside [enumStart, enumLimit)
     */
    public short getShort(int enumProbe) {
        if (enumStart <= enumProbe && enumProbe < enumLimit) {
            return offsetArray[enumProbe - enumStart];
        }
        throw new IllegalIcuArgumentException("Invalid enum. enumStart = " +enumStart +
                " enumLimit = " + enumLimit +
                " enumProbe = " + enumProbe );
    }
}
/**
* Generic map from enum values to offsets. Enum values need not
* be contiguous.
*/
private static class NonContiguousEnumToShort implements EnumToShort {
    /** Enum values; getShort() stops scanning at the first entry greater than the probe. */
    int enumArray[];
    /** Short stored for the enum value at the same index of enumArray. */
    short offsetArray[];

    /**
     * Reads an int count, then count ints (the enum values),
     * then count shorts (their mapped values) from the stream.
     */
    NonContiguousEnumToShort(ICUBinaryStream s) throws IOException {
        final int count = s.readInt();
        enumArray = new int[count];
        offsetArray = new short[count];
        for (int k=0; k<count; ++k) {
            enumArray[k] = s.readInt();
        }
        for (int k=0; k<count; ++k) {
            offsetArray[k] = s.readShort();
        }
    }

    /**
     * Returns the short stored for enumProbe.
     * @throws IllegalIcuArgumentException if the scan does not find enumProbe
     */
    public short getShort(int enumProbe) {
        // Skip every entry below the probe; the first entry that is not below
        // it either is the probe or proves that the probe is absent.
        int at = 0;
        while (at < enumArray.length && enumArray[at] < enumProbe) {
            ++at;
        }
        if (at < enumArray.length && enumArray[at] == enumProbe) {
            return offsetArray[at];
        }
        throw new IllegalIcuArgumentException("Invalid enum");
    }
}
/**
* Map from names to enum values.
*/
private class NameToEnum {
    /** Enum value for the name at the same index of nameArray. */
    int enumArray[];
    /** Indices into stringPool[]; getEnum() stops at the first name that compares greater than the probe. */
    short nameArray[];

    /**
     * Reads an int count, then count ints (enum values), then count shorts
     * (string pool offsets, converted to stringPool[] indices by the Builder).
     */
    NameToEnum(Builder b) throws IOException {
        final int count = b.readInt();
        enumArray = new int[count];
        nameArray = new short[count];
        for (int k=0; k<count; ++k) {
            enumArray[k] = b.readInt();
        }
        for (int k=0; k<count; ++k) {
            nameArray[k] = b.stringOffsetToIndex(b.readShort());
        }
    }

    /**
     * Returns the enum value for nameProbe, matched with UPropertyAliases.compare(),
     * or UProperty.UNDEFINED if no stored name compares equal.
     */
    int getEnum(String nameProbe) {
        for (int k=0; k<nameArray.length; ++k) {
            final int order = UPropertyAliases.compare(nameProbe,
                    stringPool[nameArray[k]]);
            if (order == 0) {
                return enumArray[k];
            }
            if (order < 0) {
                // The probe compares below this name: give up.
                break;
            }
        }
        return UProperty.UNDEFINED;
    }
}
//----------------------------------------------------------------
// Runtime implementation
/**
* Compare two property names, returning <0, 0, or >0. The
* comparison is that described as "loose" matching in the
@ -447,7 +354,7 @@ public final class UPropertyAliases implements ICUBinary.Authenticate {
cstrb = 0;
}
rc = UCharacter.toLowerCase(cstra) - UCharacter.toLowerCase(cstrb);
rc = asciiToLowercase(cstra) - asciiToLowercase(cstrb);
if (rc != 0) {
return rc;
}
@ -456,218 +363,4 @@ public final class UPropertyAliases implements ICUBinary.Authenticate {
++istrb;
}
}
/**
* Given an index to a run within the nameGroupPool[], and a
* nameChoice (0,1,...), select the nameChoice-th entry of the run.
*/
private String chooseNameInGroup(short nameGroupIndex, int nameChoice) {
    if (nameChoice < 0) {
        throw new IllegalIcuArgumentException("Invalid name choice");
    }
    short cursor = nameGroupIndex;
    for (int remaining = nameChoice; remaining > 0; --remaining) {
        // Stepping past a negative entry is rejected.
        // NOTE(review): presumably a negative entry marks the last name of the run.
        if (nameGroupPool[cursor] < 0) {
            throw new IllegalIcuArgumentException("Invalid name choice");
        }
        ++cursor;
    }
    // The sign is stripped before the entry is used as a stringPool[] index.
    final int entry = nameGroupPool[cursor];
    return stringPool[Math.abs(entry)];
}
/**
* Return the valueMap[] entry for a given property.
*/
private ValueMap getValueMap(int property) {
    // enumToValue maps the property enum to an index into valueMapArray[].
    return valueMapArray[enumToValue.getShort(property)];
}
//----------------------------------------------------------------
// ICUBinary API
/**
* Return true if the given data version can be used.
*/
public boolean isDataVersionAcceptable(byte version[]) {
// Only the first byte of the version array is checked;
// it must equal DATA_FORMAT_VERSION.
return version[0] == DATA_FORMAT_VERSION;
}
//----------------------------------------------------------------
// Builder
/**
* A specialized ICUBinaryStream that can map between offsets and
* index values into various arrays (stringPool, nameGroupPool,
* and valueMap). It also knows how to read various structures.
*/
static class Builder extends ICUBinaryStream {
// map[i] = offset of object i. We need maps for all of our
// arrays. The arrays are indexed by offset in the raw binary
// file; we need to translate that to index.
private short stringPool_map[];
private short valueMap_map[];
private short nameGroup_map[];
public Builder(byte raw[]) {
// Hands the raw bytes to the ICUBinaryStream superclass. The offset->index
// maps stay unset until setupValueMap_map(), readStringPool() and
// readNameGroupPool() are called.
super(raw);
}
/**
* The valueMap_map[] must be set up in advance. This method
* does that.
*/
public void setupValueMap_map(short offset, short count) {
    valueMap_map = new short[count];
    // Entry i starts at offset + 6*i because each entry is 6 bytes long.
    int entryStart = offset;
    for (int i=0; i<count; ++i) {
        valueMap_map[i] = (short) entryStart;
        entryStart += 6;
    }
}
/**
* Read stringPool[]. Build up translation table from offsets
* to string indices (stringPool_map[]).
*/
public String[] readStringPool(short offset, short count)
throws IOException {
    seek(offset);
    // Allocate one more stringPool entry than needed. Use this
    // to store a "no string" entry in the pool, at index 0. This
    // maps to offset 0, so let stringPool_map[0] = 0.
    String stringPool[] = new String[count + 1];
    stringPool_map = new short[count + 1];
    short pos = offset;
    StringBuilder buf = new StringBuilder();
    stringPool_map[0] = 0;
    for (int i=1; i<=count; ++i) {
        buf.setLength(0);
        for (;;) {
            // This works because the name is invariant-ASCII
            char c = (char) readUnsignedByte();
            if (c == 0) break;
            buf.append(c);
        }
        // Record the offset at which string i started, then advance past
        // its characters and the terminating NUL.
        stringPool_map[i] = pos;
        stringPool[i] = buf.toString();
        pos += stringPool[i].length() + 1;
    }
    if (DEBUG) {
        // Preview at most the first three strings. Indexing [1], [2] and [3]
        // unconditionally would throw ArrayIndexOutOfBoundsException for a
        // pool with fewer than three strings.
        StringBuilder preview = new StringBuilder("read stringPool x " + count + ": ");
        int shown = Math.min(count, 3);
        for (int i=1; i<=shown; ++i) {
            if (i > 1) {
                preview.append(", ");
            }
            preview.append(stringPool[i]);
        }
        if (count >= 3) {
            preview.append(",...");
        }
        System.out.println(preview);
    }
    return stringPool;
}
/**
* Read the nameGroupPool[], and build up the offset->index
* map (nameGroup_map[]).
*/
public short[] readNameGroupPool(short offset, short count)
throws IOException {
    // Read nameGroupPool[]. This contains offsets from start of
    // header. We translate these into indices into stringPool[]
    // on the fly. The offset 0, which indicates "no entry", we
    // translate into index 0, which contains a null String
    // pointer.
    seek(offset);
    short pos = offset;
    short nameGroupPool[] = new short[count];
    nameGroup_map = new short[count];
    for (int i=0; i<count; ++i) {
        // Each pool entry is one short, so consecutive entries are 2 bytes apart.
        nameGroup_map[i] = pos;
        nameGroupPool[i] = stringOffsetToIndex(readShort());
        pos += 2;
    }
    if (DEBUG) {
        // Preview at most the first three entries. Indexing [0], [1] and [2]
        // unconditionally would throw ArrayIndexOutOfBoundsException for a
        // pool with fewer than three entries.
        StringBuilder preview = new StringBuilder("read nameGroupPool x " + count + ": ");
        int shown = Math.min(count, 3);
        for (int i=0; i<shown; ++i) {
            if (i > 0) {
                preview.append(", ");
            }
            preview.append(nameGroupPool[i]);
        }
        if (count >= 3) {
            preview.append(",...");
        }
        System.out.println(preview);
    }
    return nameGroupPool;
}
/**
* Convert an offset into the string pool into a stringPool[]
* index.
*/
private short stringOffsetToIndex(short offset) {
    // The search uses the magnitude of the offset; the sign of the offset
    // is carried over to the returned index.
    final boolean negated = offset < 0;
    final int target = negated ? -offset : offset;
    int i = 0;
    while (i < stringPool_map.length) {
        if (stringPool_map[i] == target) {
            return (short) (negated ? -i : i);
        }
        ++i;
    }
    throw new IllegalStateException("Can't map string pool offset " +
            offset + " to index");
}
/**
* Convert an array of offsets into the string pool into an
* array of stringPool[] indices. MODIFIES THE ARRAY IN
* PLACE.
*/
/* private void stringOffsetToIndex(short array[]) {
for (int i=0; i<array.length; ++i) {
array[i] = stringOffsetToIndex(array[i]);
}
}*/
/**
* Convert an offset into the value map into a valueMap[]
* index.
*/
private short valueMapOffsetToIndex(short offset) {
    // Linear scan of valueMap_map[] for the entry that starts at this offset.
    short index = 0;
    while (index < valueMap_map.length) {
        if (valueMap_map[index] == offset) {
            return index;
        }
        ++index;
    }
    throw new IllegalStateException("Can't map value map offset " +
            offset + " to index");
}
/**
* Convert an array of offsets into the value map array into
* an array of valueMap[] indices. MODIFIES THE ARRAY IN
* PLACE.
*/
private void valueMapOffsetToIndex(short array[]) {
    // Front-to-back, in place: each offset is overwritten by its index.
    int slot = 0;
    while (slot < array.length) {
        final short asIndex = valueMapOffsetToIndex(array[slot]);
        array[slot] = asIndex;
        ++slot;
    }
}
/**
* Convert an offset into the name group pool into a
* nameGroupPool[] index.
*/
private short nameGroupOffsetToIndex(short offset) {
    // Linear scan of nameGroup_map[] for the entry that starts at this offset.
    for (short i=0; i<nameGroup_map.length; ++i) {
        if (nameGroup_map[i] == offset) {
            return i;
        }
    }
    // Throw IllegalStateException, as stringOffsetToIndex() and
    // valueMapOffsetToIndex() do for an unmappable offset, instead of a raw
    // RuntimeException. It is a RuntimeException subclass, so existing
    // handlers keep working.
    throw new IllegalStateException("Can't map name group offset " + offset +
            " to index");
}
/**
* Convert an array of offsets into the name group pool into an
* array of nameGroupPool[] indices. MODIFIES THE ARRAY IN
* PLACE.
*/
private void nameGroupOffsetToIndex(short array[]) {
    // Front-to-back, in place: each offset is overwritten by its index.
    int slot = 0;
    while (slot < array.length) {
        final short asIndex = nameGroupOffsetToIndex(array[slot]);
        array[slot] = asIndex;
        ++slot;
    }
}
}
}

View file

@ -4206,7 +4206,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* @see UProperty
* @stable ICU 2.4
*/
public static int getPropertyEnum(String propertyAlias) {
public static int getPropertyEnum(CharSequence propertyAlias) {
int propEnum = UPropertyAliases.INSTANCE.getPropertyEnum(propertyAlias);
if (propEnum == UProperty.UNDEFINED) {
throw new IllegalIcuArgumentException("Invalid name: " + propertyAlias);
@ -4318,7 +4318,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* selector or valueAlias is not a value of this property
* @stable ICU 2.4
*/
public static int getPropertyValueEnum(int property, String valueAlias) {
public static int getPropertyValueEnum(int property, CharSequence valueAlias) {
int propEnum = UPropertyAliases.INSTANCE.getPropertyValueEnum(property, valueAlias);
if (propEnum == UProperty.UNDEFINED) {
throw new IllegalIcuArgumentException("Invalid name: " + valueAlias);

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bbc7ef8ee9ef1d9992f7a2836e8e6ada320802fbc64bfc8d48966ce705581781
size 6739159
oid sha256:a5ad97a089e050870cd3e20b5b0c2d38651637e576c9020edc55431e62c31b56
size 6740512