ICU-7273 build data for CanonicalIterator start sets on the fly; replace remaining uses of NormalizerImpl

X-SVN-Rev: 27561
This commit is contained in:
Markus Scherer 2010-02-13 22:13:37 +00:00
parent 0ec6c28016
commit b15f884b16
12 changed files with 183 additions and 986 deletions

View file

@ -33,6 +33,8 @@ public final class Normalizer2Impl {
public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
@ -502,25 +504,19 @@ public final class Normalizer2Impl {
canonStartSets=new ArrayList<UnicodeSet>();
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
while(trieIterator.hasNext()) {
Trie2.Range range=trieIterator.next();
int norm16=range.value;
if(norm16==0) {
continue; // inert
}
if(norm16==minYesNo) {
// Hangul LV & LVT: Set has-compositions for all syllables
// to minimize the trie size, although only LV syllables
// do have compositions. Handle at runtime.
// Set the same value for the whole range because
// there cannot be other data. Hangul syllables are segment starters,
// and since they decompose they cannot have canonStartSets.
// (There is no decomposable character in a decomposition mapping.)
range.value=CANON_HAS_COMPOSITIONS;
newData.setRange(range, true);
final Trie2.Range range=trieIterator.next();
final int norm16=range.value;
if(range.leadSurrogate || norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
// Inert, or 2-way mapping (including Hangul syllable).
// We do not write a canonStartSet for any yesNo character.
// Composites from 2-way mappings are added at runtime from the
// starter's compositions list, and the other characters in
// 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
// "maybe" characters.
continue;
}
for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
int oldValue=newData.get(c);
final int oldValue=newData.get(c);
int newValue=oldValue;
if(norm16>=minMaybeYes) {
// not a segment starter if it occurs in a decomposition or has cc!=0
@ -531,36 +527,39 @@ public final class Normalizer2Impl {
} else if(norm16<minYesNo) {
newValue|=CANON_HAS_COMPOSITIONS;
} else {
// c has a decomposition
// c has a one-way decomposition
int c2=c;
while(limitNoNo<=norm16 && norm16<minMaybeYes) {
c2=this.mapAlgorithmic(c2, norm16);
norm16=getNorm16(c2);
int norm16_2=norm16;
while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
c2=this.mapAlgorithmic(c2, norm16_2);
norm16_2=getNorm16(c2);
}
if(minYesNo<=norm16 && norm16<limitNoNo) {
if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
// c decomposes, get everything from the variable-length extra data
int firstUnit=extraData.charAt(norm16++);
if(c==c2 && (firstUnit&MAPPING_PLUS_COMPOSITION_LIST)!=0) {
newValue|=CANON_HAS_COMPOSITIONS; // original c has compositions
}
int firstUnit=extraData.charAt(norm16_2++);
int length=firstUnit&MAPPING_LENGTH_MASK;
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
if(c==c2 && (extraData.charAt(norm16)&0xff)!=0) {
if(c==c2 && (extraData.charAt(norm16_2)&0xff)!=0) {
newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
}
++norm16;
++norm16_2;
}
// Skip empty mappings (no characters in the decomposition).
if(length!=0) {
// add c to first code point's start set
int limit=norm16+length;
c2=extraData.codePointAt(norm16);
int limit=norm16_2+length;
c2=extraData.codePointAt(norm16_2);
addToStartSet(newData, c, c2);
// set CANON_NOT_SEGMENT_STARTER for each remaining code point
while((norm16+=Character.charCount(c2))<limit) {
c2=extraData.codePointAt(norm16);
int c2Value=newData.get(c2);
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
// Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
// one-way mapping. A 2-way mapping is possible here after
// intermediate algorithmic mapping.
if(norm16_2>=minNoNo) {
while((norm16_2+=Character.charCount(c2))<limit) {
c2=extraData.codePointAt(norm16_2);
int c2Value=newData.get(c2);
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
}
}
}
}
@ -692,6 +691,29 @@ public final class Normalizer2Impl {
/**
 * Tests whether c starts a segment for the CanonicalIterator.
 * A negative canonIterData trie value carries the
 * CANON_NOT_SEGMENT_STARTER flag, so non-negative means "segment starter".
 */
public boolean isCanonSegmentStarter(int c) {
    final int canonValue = canonIterData.get(c);
    return 0 <= canonValue;
}
/**
 * Fills the set with the characters whose canonical decompositions start
 * with c and, when c has compositions, with c's composites.
 *
 * @param c code point
 * @param set receives the canonical start set; cleared first
 * @return true if there is any canon-start-set data for c, false otherwise
 */
public boolean getCanonStartSet(int c, UnicodeSet set) {
// Strip the not-segment-starter flag; the remaining bits say what data exists for c.
int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
if(canonValue==0) {
return false;
}
set.clear();
int value=canonValue&CANON_VALUE_MASK;
if((canonValue&CANON_HAS_SET)!=0) {
// value is an index into the list of pre-built start sets.
set.addAll(canonStartSets.get(value));
} else if(value!=0) {
// value itself is a single code point: a one-element start set.
set.add(value);
}
if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
int norm16=getNorm16(c);
if(norm16==JAMO_L) {
// c is a lead jamo: it starts exactly the syllables of its LV/LVT row.
int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
} else {
// Add all composites reachable from c's compositions list.
addComposites(getCompositionsList(norm16), set);
}
}
return true;
}
public static final int MIN_CCC_LCCC_CP=0x300;
@ -1503,7 +1525,7 @@ public final class Normalizer2Impl {
/**
* @return index into maybeYesCompositions, or -1
*/
private int getCompositionsListForDecompYesAndZeroCC(int norm16) {
private int getCompositionsListForDecompYes(int norm16) {
if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
return -1;
} else {
@ -1527,6 +1549,15 @@ public final class Normalizer2Impl {
(firstUnit&MAPPING_LENGTH_MASK)+ // + mapping length
((firstUnit>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
}
/**
 * Returns the compositions-list offset for a character that is known to
 * have compositions.
 *
 * @param norm16 the character's norm16 trie value; the character must
 *               have compositions
 * @return index into maybeYesCompositions
 */
private int getCompositionsList(int norm16) {
    if (isDecompYes(norm16)) {
        return getCompositionsListForDecompYes(norm16);
    }
    return getCompositionsListForComposite(norm16);
}
// Decompose a short piece of text which is likely to contain characters that
// fail the quick check loop and/or where the quick check loop's overhead
@ -1639,6 +1670,29 @@ public final class Normalizer2Impl {
}
return -1;
}
/**
 * Recursively adds to the set every composite reachable from one
 * compositions list.
 *
 * @param list index into maybeYesCompositions of a starter's compositions list
 * @param set receives the composites; composites that themselves combine
 *            forward contribute their own composites recursively
 */
private void addComposites(int list, UnicodeSet set) {
int firstUnit, compositeAndFwd;
do {
// Each list entry is either a pair (2 units) or, when COMP_1_TRIPLE is
// set, a triple (3 units) encoding a composite above 0xffff.
firstUnit=maybeYesCompositions.charAt(list);
if((firstUnit&COMP_1_TRIPLE)==0) {
compositeAndFwd=maybeYesCompositions.charAt(list+1);
list+=2;
} else {
compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
maybeYesCompositions.charAt(list+2);
list+=3;
}
// Bit 0 flags a composite that combines forward: recurse into its
// own compositions list before adding the composite itself.
int composite=compositeAndFwd>>1;
if((compositeAndFwd&1)!=0) {
addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
}
set.add(composite);
} while((firstUnit&COMP_1_LAST_TUPLE)==0);
}
/*
* Recomposes the buffer text starting at recomposeStartIndex
* (which is in NFD - decomposed and canonically ordered),
@ -1777,7 +1831,7 @@ public final class Normalizer2Impl {
// If c did not combine, then check if it is a starter.
if(cc==0) {
// Found a new starter.
if((compositionsList=getCompositionsListForDecompYesAndZeroCC(norm16))>=0) {
if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
// It may combine with something, prepare for it.
if(c<=0xffff) {
starterIsSupplementary=false;

View file

@ -1,427 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.*;
import com.ibm.icu.impl.ICUDebug;
/**
* @version 1.0
* @author Ram Viswanadha
*/
/*
* Description of the format of unorm.icu version 2.1.
*
* Main change from version 1 to version 2:
* Use of new, common Trie instead of normalization-specific tries.
* Change to version 2.1: add third/auxiliary trie with associated data.
*
* For more details of how to use the data structures see the code
* in unorm.cpp (runtime normalization code) and
* in gennorm.c and gennorm/store.c (build-time data generation).
*
* For the serialized format of Trie see Trie.c/TrieHeader.
*
* - Overall partition
*
* unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
* After that there are the following structures:
*
* char indexes[INDEX_TOP]; -- INDEX_TOP=32, see enum in this file
*
* Trie normTrie; -- size in bytes=indexes[INDEX_TRIE_SIZE]
*
* char extraData[extraDataTop]; -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
* extraData[0] contains the number of units for
* FC_NFKC_Closure (formatVersion>=2.1)
*
* char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
* combiningTableTop may include one 16-bit padding unit
* to make sure that fcdTrie is 32-bit-aligned
*
* Trie fcdTrie; -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
*
* Trie auxTrie; -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
*
* char canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[INDEX_CANON_SET_COUNT]
* serialized USets, see uset.c
*
*
* The indexes array contains lengths and sizes of the following arrays and structures
* as well as the following values:
* indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
* -- one more than the highest combining index computed for forward-only-combining characters
* indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
* -- number of combining indexes computed for both-ways-combining characters
* indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
* -- number of combining indexes computed for backward-only-combining characters
*
* indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
* -- first code point with a quick check NF* value of NO/MAYBE
*
*
* - Tries
*
* The main structures are two Trie tables ("compact arrays"),
* each with one index array and one data array.
* See Trie.h and Trie.c.
*
*
* - Tries in unorm.icu
*
* The first trie (normTrie above)
* provides data for the NF* quick checks and normalization.
* The second trie (fcdTrie above) provides data just for FCD checks.
*
*
* - norm32 data words from the first trie
*
* The norm32Table contains one 32-bit word "norm32" per code point.
* It contains the following bit fields:
* 31..16 extra data index, EXTRA_SHIFT is used to shift this field down
* if this index is <EXTRA_INDEX_TOP then it is an index into
* extraData[] where variable-length normalization data for this
* code point is found
* if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
* then this is a norm32 for a leading surrogate, and the index
* value is used together with the following trailing surrogate
* code unit in the second trie access
* if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
* then this is a norm32 for a "special" character,
* i.e., the character is a Hangul syllable or a Jamo
* see EXTRA_HANGUL etc.
* generally, instead of extracting this index from the norm32 and
* comparing it with the above constants,
* the normalization code compares the entire norm32 value
* with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
*
* 15..8 combining class (cc) according to UnicodeData.txt
*
* 7..6 COMBINES_ANY flags, used in composition to see if a character
* combines with any following or preceding character(s)
* at all
* 7 COMBINES_BACK
* 6 COMBINES_FWD
*
* 5..0 quick check flags, set for "no" or "maybe", with separate flags for
* each normalization form
* the higher bits are "maybe" flags; for NF*D there are no such flags
* the lower bits are "no" flags for all forms, in the same order
* as the "maybe" flags,
* which is (MSB to LSB): NFKD NFD NFKC NFC
* 5..4 QC_ANY_MAYBE
* 3..0 QC_ANY_NO
* see further related constants
*
*
* - Extra data per code point
*
* "Extra data" is referenced by the index in norm32.
* It is variable-length data. It is only present, and only those parts
* of it are, as needed for a given character.
* The norm32 extra data index is added to the beginning of extraData[]
* to get to a vector of 16-bit words with data at the following offsets:
*
* [-1] Combining index for composition.
* Stored only if norm32&COMBINES_ANY .
* [0] Lengths of the canonical and compatibility decomposition strings.
* Stored only if there are decompositions, i.e.,
* if norm32&(QC_NFD|QC_NFKD)
* High byte: length of NFKD, or 0 if none
* Low byte: length of NFD, or 0 if none
* Each length byte also has another flag:
* Bit 7 of a length byte is set if there are non-zero
* combining classes (cc's) associated with the respective
* decomposition. If this flag is set, then the decomposition
* is preceded by a 16-bit word that contains the
* leading and trailing cc's.
* Bits 6..0 of a length byte are the length of the
* decomposition string, not counting the cc word.
* [1..n] NFD
* [n+1..] NFKD
*
* Each of the two decompositions consists of up to two parts:
* - The 16-bit words with the leading and trailing cc's.
* This is only stored if bit 7 of the corresponding length byte
* is set. In this case, at least one of the cc's is not zero.
* High byte: leading cc==cc of the first code point in the decomposition string
* Low byte: trailing cc==cc of the last code point in the decomposition string
* - The decomposition string in UTF-16, with length code units.
*
*
* - Combining indexes and combiningTable[]
*
* Combining indexes are stored at the [-1] offset of the extra data
* if the character combines forward or backward with any other characters.
* They are used for (re)composition in NF*C.
* Values of combining indexes are arranged according to whether a character
* combines forward, backward, or both ways:
* forward-only < both ways < backward-only
*
* The index values for forward-only and both-ways combining characters
* are indexes into the combiningTable[].
* The index values for backward-only combining characters are simply
* incremented from the preceding index values to be unique.
*
* In the combiningTable[], a variable-length list
* of variable-length (back-index, code point) pair entries is stored
* for each forward-combining character.
*
* These back-indexes are the combining indexes of both-ways or backward-only
* combining characters that the forward-combining character combines with.
*
* Each list is sorted in ascending order of back-indexes.
* Each list is terminated with the last back-index having bit 15 set.
*
* Each pair (back-index, code point) takes up either 2 or 3
* 16-bit words.
* The first word of a list entry is the back-index, with its bit 15 set if
* this is the last pair in the list.
*
* The second word contains flags in bits 15..13 that determine
* if there is a third word and how the combined character is encoded:
* 15 set if there is a third word in this list entry
* 14 set if the result is a supplementary character
* 13 set if the result itself combines forward
*
* According to these bits 15..14 of the second word,
* the result character is encoded as follows:
* 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
* the second word.
* 10 The result is 0x2000..0xffff and stored in the third word.
* Bits 12..0 of the second word are not used.
* 11 The result is a supplementary character.
* Bits 9..0 of the leading surrogate are in bits 9..0 of
* the second word.
* Add 0xd800 to these bits to get the complete surrogate.
* Bits 12..10 of the second word are not used.
* The trailing surrogate is stored in the third word.
*
*
* - FCD trie
*
* The FCD trie is very simple.
* It is a folded trie with 16-bit data words.
* In each word, the high byte contains the leading cc of the character,
* and the low byte contains the trailing cc of the character.
* These cc's are the cc's of the first and last code points in the
* canonical decomposition of the character.
*
* Since all 16 bits are used for cc's, lead surrogates must be tested
* by checking the code unit instead of the trie data.
* This is done only if the 16-bit data word is not zero.
* If the code unit is a leading surrogate and the data word is not zero,
* then instead of cc's it contains the offset for the second trie lookup.
*
*
* - Auxiliary trie and data
*
*
* The auxiliary 16-bit trie contains data for additional properties.
* Bits
* 15..13 reserved
* 12 not NFC_Skippable (f) (formatVersion>=2.2)
* 11 flag: not a safe starter for canonical closure
* 10 composition exclusion
* 9.. 0 index into extraData[] to FC_NFKC_Closure string
* (not for lead surrogate),
* or lead surrogate offset (for lead surrogate, if 9..0 not zero)
*
* Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
* (used in NormalizerTransliterator)
*
* A skippable character is
* a) unassigned, or ALL of the following:
* b) of combining class 0.
* c) not decomposed by this normalization form.
* AND if NFC or NFKC,
* d) can never compose with a previous character.
* e) can never compose with a following character.
* f) can never change if another character is added.
* Example: a-breve might satisfy all but f, but if you
* add an ogonek it changes to a-ogonek + breve
*
* a)..e) must be tested from norm32.
* Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
* into the auxiliary trie.
* The same bit is used for NFC and NFKC; (c) differs for them.
* As usual, we build the "not skippable" flags so that unassigned
* code points get a 0 bit.
* This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
* Test Hangul LV syllables entirely in code.
*
*
* - FC_NFKC_Closure strings in extraData[]
*
* Strings are either stored as a single code unit or as the length
* followed by that many units.
*
* - structure inside canonStartSets[]
*
* This array maps from code points c to sets of code points (USerializedSet).
* The result sets are the code points whose canonical decompositions start
* with c.
*
* canonStartSets[] contains the following sub-arrays:
*
* indexes[_NORM_SET_INDEX_TOP]
* - contains lengths of sub-arrays etc.
*
* startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
* - contains serialized sets (USerializedSet) of canonical starters for
* enumerating canonically equivalent strings
* indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
* for details about the structure see uset.c
*
* bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
* - a sorted search table for BMP code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
* if yy==01 then there is a USerializedSet at canonStartSets+x
* else build a USerializedSet with result as the single code point
*
* suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
* - a sorted search table for supplementary code points whose results are
* either indexes to USerializedSets or single code points for
* single-code point sets;
* each entry is a triplet of { high16(cp), low16(cp), result }
* each code point's high-word may contain extra data in bits 15..5:
* if the high word has bit 15 set, then build a set with a single code point
* which is (((high16(cp)&0x1f00)<<8)|result;
* else there is a USerializedSet at canonStartSets+result
*/
/**
 * Binary reader for the legacy unorm.icu normalization data file.
 * Authenticates the ICU data header, then reads the file's sections
 * (tries, extra data, combining table, canonical start sets) into
 * caller-provided arrays in the exact order they appear in the file.
 */
final class NormalizerDataReader implements ICUBinary.Authenticate {
// Debug tracing is toggled via the ICUDebug system property mechanism.
private final static boolean debug = ICUDebug.enabled("NormalizerDataReader");
/**
* <p>Protected constructor.</p>
* @param inputStream unorm.icu data file input stream
* @exception IOException thrown if the data file fails authentication
*/
protected NormalizerDataReader(InputStream inputStream)
throws IOException{
if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
// Validates the data format ID and version; also yields the Unicode version bytes.
unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
dataInputStream = new DataInputStream(inputStream);
if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
}
// protected methods -------------------------------------------------
/**
* Reads {@code length} big-endian 32-bit indexes from the data file.
* @param length number of int indexes to read
* @return the newly read indexes array
* @throws IOException if reading fails
*/
protected int[] readIndexes(int length)throws IOException{
int[] indexes = new int[length];
//Read the indexes
for (int i = 0; i <length ; i++) {
indexes[i] = dataInputStream.readInt();
}
return indexes;
}
/**
* <p>Reads unorm.icu, parse it into blocks of data to be stored in
* NormalizerImpl.</p>
* @param normBytes receives the serialized normalization trie
* @param fcdBytes receives the serialized FCD trie
* @param auxBytes receives the serialized auxiliary trie
* @param extraData receives the variable-length extra data
* @param combiningTable receives the combining-class table
* @param canonStartSets receives the four canonical-start-set sub-arrays
* @exception IOException thrown when data reading fails
*/
protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
char[] extraData, char[] combiningTable,
Object[] canonStartSets)
throws IOException
{
// NOTE: the sections must be read in exactly this order; it mirrors the
// layout written by gennorm (see the format description above).
// Read the bytes that make up the normTrie
dataInputStream.readFully(normBytes);
// normTrieStream= new ByteArrayInputStream(normBytes);
// Read the extra data
for (int i = 0; i < extraData.length; i++) {
extraData[i] = dataInputStream.readChar();
}
// Read the combining class table
for (int i = 0; i < combiningTable.length; i++) {
combiningTable[i] = dataInputStream.readChar();
}
// Read the fcdTrie
dataInputStream.readFully(fcdBytes);
// Read the AuxTrie
dataInputStream.readFully(auxBytes);
// Read the canonical start sets: first the 32 16-bit set indexes, then
// the three variable-length tables whose lengths come from those indexes.
int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
for (int i = 0; i < canonStartSetsIndexes.length; i++) {
canonStartSetsIndexes[i] = dataInputStream.readChar();
}
char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH] - NormalizerImpl.SET_INDEX_TOP];
for (int i = 0; i < startSets.length; i++) {
startSets[i] = dataInputStream.readChar();
}
char[] bmpTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_BMP_TABLE_LENGTH]];
for (int i = 0; i < bmpTable.length; i++) {
bmpTable[i] = dataInputStream.readChar();
}
char[] suppTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SUPP_TABLE_LENGTH]];
for (int i = 0; i < suppTable.length; i++) {
suppTable[i] = dataInputStream.readChar();
}
// Hand the four parsed sub-arrays back in their well-known slots.
canonStartSets[NormalizerImpl.CANON_SET_INDICIES_INDEX] = canonStartSetsIndexes;
canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX] = bmpTable;
canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
}
/** @return the 4-byte data format version this reader understands */
public byte[] getDataFormatVersion(){
return DATA_FORMAT_VERSION;
}
/**
* Header-authentication callback: accepts a file whose version bytes at
* indexes 0, 2 and 3 match. Index 1 is deliberately not compared —
* presumably a compatible minor revision; see gennorm's store.c.
*/
public boolean isDataVersionAcceptable(byte version[])
{
return version[0] == DATA_FORMAT_VERSION[0]
&& version[2] == DATA_FORMAT_VERSION[2]
&& version[3] == DATA_FORMAT_VERSION[3];
}
/** @return the 4-byte Unicode version from the authenticated file header */
public byte[] getUnicodeVersion(){
return unicodeVersion;
}
// private data members -------------------------------------------------
/**
* ICU data file input stream
*/
private DataInputStream dataInputStream;
// Unicode version bytes read from the authenticated data file header.
private byte[] unicodeVersion;
/** Data format ID: "Norm" in ASCII (0x4E 0x6F 0x72 0x6D). */
private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
(byte)0x72, (byte)0x6D};
/**
* File format version that this class understands.
* No guarantees are made if an older version is used;
* see store.c of gennorm for more information and values.
*/
private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
(byte)0x5, (byte)0x2};
}

View file

@ -1,384 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.util.MissingResourceException;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;
/**
* @version 1.0
* @author Ram Viswanadha
*/
public final class NormalizerImpl {
// Static block for the class to initialize its own singleton instance.
// Data loading happens exactly once here; any failure surfaces as an
// unchecked MissingResourceException so static initialization fails loudly.
static final NormalizerImpl IMPL;
static {
    try {
        IMPL = new NormalizerImpl();
    } catch (Exception e) {
        // Fix: preserve the original failure as the cause instead of
        // discarding it (MissingResourceException has no cause constructor,
        // so chain it via initCause()).
        MissingResourceException mre =
            new MissingResourceException(e.getMessage(), "", "");
        mre.initCause(e);
        throw mre;
    }
}
/* masks for treating byte/int values as unsigned */
static final int UNSIGNED_BYTE_MASK =0xFF;
static final long UNSIGNED_INT_MASK = 0xffffffffL;
/*
 * This new implementation of the normalization code loads its data from
 * unorm.icu, which is generated with the gennorm tool.
 * The format of that file is described at the end of this file.
 */
private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/unorm.icu";
/* indexes[] value names */
/* number of bytes in normalization trie */
static final int INDEX_TRIE_SIZE = 0;
/* number of chars in extra data */
static final int INDEX_CHAR_COUNT = 1;
/* number of uint16_t words for combining data */
static final int INDEX_COMBINE_DATA_COUNT = 2;
/* number of code points that combine forward */
static final int INDEX_COMBINE_FWD_COUNT = 3;
/* number of code points that combine forward and backward */
static final int INDEX_COMBINE_BOTH_COUNT = 4;
/* number of code points that combine backward */
static final int INDEX_COMBINE_BACK_COUNT = 5;
/* first code point with quick check NFC NO/MAYBE */
public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
/* first code point with quick check NFKC NO/MAYBE */
public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
/* first code point with quick check NFD NO/MAYBE */
public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
/* first code point with quick check NFKD NO/MAYBE */
public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
/* number of bytes in FCD trie */
static final int INDEX_FCD_TRIE_SIZE = 10;
/* number of bytes in the auxiliary trie */
static final int INDEX_AUX_TRIE_SIZE = 11;
/* number of uint16_t in the array of serialized USet */
static final int INDEX_CANON_SET_COUNT = 12;
/* changing this requires a new formatVersion */
static final int INDEX_TOP = 32;
/* AUX constants */
/* value constants for auxTrie */
private static final int AUX_UNSAFE_SHIFT = 11;
private static final int AUX_COMP_EX_SHIFT = 10;
private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT;
private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
/* canonStartSets[0..31] contains indexes for what is in the array */
/* number of uint16_t in canonical starter sets */
static final int SET_INDEX_CANON_SETS_LENGTH = 0;
/* number of uint16_t in the BMP search table (contains pairs) */
static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
/* number of uint16_t in the supplementary search table(contains triplets)*/
static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
/* changing this requires a new formatVersion */
static final int SET_INDEX_TOP = 32;
/* slots of the canonStartSets Object[] as filled by NormalizerDataReader.read()
 * (the "INDICIES" misspelling is part of the existing name and kept as-is) */
static final int CANON_SET_INDICIES_INDEX = 0;
static final int CANON_SET_START_SETS_INDEX = 1;
static final int CANON_SET_BMP_TABLE_INDEX = 2;
static final int CANON_SET_SUPP_TABLE_INDEX = 3;
/* 14 bit indexes to canonical USerializedSets */
static final int CANON_SET_MAX_CANON_SETS = 0x4000;
/* single-code point BMP sets are encoded directly in the search table
 * except if result=0x4000..0x7fff
 */
static final int CANON_SET_BMP_MASK = 0xc000;
static final int CANON_SET_BMP_IS_INDEX = 0x4000;
/**
* Internal option for cmpEquivFold() for decomposing.
* If not set, just do strcasecmp().
* @internal
*/
public static final int COMPARE_EQUIV = 0x80000;
/*******************************/
/* Wrappers for Trie implementations */
/** Trie wrapper that supplies the surrogate-folding rule for the auxiliary trie. */
static final class AuxTrieImpl implements Trie.DataManipulate {
    static CharTrie auxTrie = null;

    /**
     * Called by com.ibm.icu.util.Trie to extract, from a lead surrogate's
     * data value, the index-array offset of the indexes for that lead
     * surrogate. For auxTrie, the folding offset lives in bits 9..0 of the
     * 16-bit trie result.
     *
     * @param value data value for a surrogate from the trie, including the
     *              folding offset
     * @return data offset, or 0 if there is no data for the lead surrogate
     */
    public int getFoldingOffset(int value) {
        final int foldingBits = value & AUX_FNC_MASK;
        return foldingBits << SURROGATE_BLOCK_BITS;
    }
}
/****************************************************/
// Shared parsed-data state; loaded once by the private constructor.
private static AuxTrieImpl auxTrieImpl;
private static int[] indexes;
private static char[] combiningTable;
private static char[] extraData;
private static Object[] canonStartSets;
// true after the data file has been fully read and parsed
private static boolean isDataLoaded;
// true when the loaded file is format version >= 2.1
private static boolean isFormatVersion_2_1;
// 4-byte Unicode version from the data file header
private static byte[] unicodeVersion;
/**
* Default buffer size of datafile
*/
private static final int DATA_BUFFER_SIZE = 25000;
/**
* FCD check: everything below this code point is known to have a 0
* lead combining class
*/
public static final int MIN_WITH_LEAD_CC=0x300;
/** Number of bits of a trail surrogate that are used in index table
* lookups.
*/
private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
// protected constructor ---------------------------------------------
/**
 * Constructor: loads and parses unorm.icu on first use. All parsed data
 * is kept in static fields, so construction is a no-op after the first
 * successful load.
 * @throws IOException thrown when data reading fails or data corrupted
 */
private NormalizerImpl() throws IOException {
//data should be loaded only once
if(!isDataLoaded){
// jar access
InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
NormalizerDataReader reader = new NormalizerDataReader(b);
// read the indexes
indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
// Allocate each section's array using the sizes recorded in the indexes.
byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
combiningTable = new char[combiningTableTop];
int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
extraData = new char[extraDataTop];
byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
// NOTE(review): only slots 0..3 of this array are ever used (see the
// CANON_SET_*_INDEX constants); sizing it CANON_SET_MAX_CANON_SETS
// (0x4000) looks unintended — confirm before changing.
canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
auxTrieImpl = new AuxTrieImpl();
// load the rest of the data and initialize the data members
reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable,
canonStartSets);
AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
// we reached here without any exceptions so the data is fully
// loaded set the variable to true
isDataLoaded = true;
// get the data format version
byte[] formatVersion = reader.getDataFormatVersion();
// Format 2.1 added the auxiliary trie with associated data
// (see the file-format description in NormalizerDataReader).
isFormatVersion_2_1 =( formatVersion[0]>2
||
(formatVersion[0]==2 && formatVersion[1]>=1)
);
unicodeVersion = reader.getUnicodeVersion();
b.close();
}
}
/* ---------------------------------------------------------------------- */
/* Korean Hangul and Jamo constants */
public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */
public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */
public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */
public static final int HANGUL_BASE=0xac00; /* first precomposed Hangul syllable */
public static final int JAMO_L_COUNT=19;
public static final int JAMO_V_COUNT=21;
public static final int JAMO_T_COUNT=28;
/* 19*21*28 = 11172 precomposed syllables */
public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
/* data access primitives ----------------------------------------------- */
/** Returns the Unicode version, repackaging the four raw bytes read from the data file header. */
public static VersionInfo getUnicodeVersion() {
    final byte[] uv = unicodeVersion;
    return VersionInfo.getInstance(uv[0], uv[1], uv[2], uv[3]);
}
/**
 * Tests whether c is a safe starter for canonical closure.
 * The "unsafe" flag only exists in format version 2.1+ data; without it,
 * conservatively report every character as not safe.
 */
public static boolean isCanonSafeStart(int c) {
    if (!isFormatVersion_2_1) {
        return false;
    }
    final int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
    return (aux & AUX_UNSAFE_MASK) == 0;
}
/**
 * Fills fillSet with the canonical-start set for code point c: the set of
 * code points whose canonical decompositions start with c.
 * Performs a binary search in the serialized lookup tables loaded from the
 * normalization data file (one table for BMP code points, one for
 * supplementary ones; layout documented in unormimp.h).
 *
 * @param c       code point to look up
 * @param fillSet output parameter; receives the serialized set on success
 * @return true if a canonical-start set exists for c and fillSet was filled
 */
public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
if(fillSet!=null && canonStartSets!=null) {
/*
 * binary search for c
 *
 * There are two search tables,
 * one for BMP code points and one for supplementary ones.
 * See unormimp.h for details.
 */
char[] table;
int i=0, start, limit;
// idxs: indexes into the serialized-set storage; startSets: the storage itself.
int[] idxs = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
if(c<=0xffff) {
table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
start=0;
limit=table.length;
/* each entry is a pair { c, result } */
while(start<limit-2) {
// midpoint rounded down to a pair boundary (/4*2 keeps i even)
i=(char)(((start+limit)/4)*2);
if(c<table[i]) {
limit=i;
} else {
start=i;
}
}
//System.out.println(i);
/* found? */
if(c==table[start]) {
i=table[start+1];
if((i & CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
/* result 01xxxxxx xxxxxx contains index x to a
 * USerializedSet */
i&=(CANON_SET_MAX_CANON_SETS-1);
// index is relative to the start of the sets storage (after the
// index array) -- presumably; mirrors the supplementary branch below
return fillSet.getSet(startSets,(i-idxs.length));
} else {
/* other result values are BMP code points for
 * single-code point sets */
fillSet.setToOne(i);
return true;
}
}
} else {
// Supplementary code point: search the triplet table keyed by
// (high 5 bits of the plane word, low 16 bits of c).
char high, low, h,j=0;
table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
start=0;
limit=table.length;
high=(char)(c>>16);
low=(char)c;
/* each entry is a triplet { high(c), low(c), result } */
while(start<limit-3) {
/* take (start+limit)/2 but rounded down to a triplet boundary */
i=(char)(((start+limit)/6)*3);
j=(char)(table[i]&0x1f); /* high word */
int tableVal = table[i+1];
int lowInt = low;
// compare (high, low) lexicographically against the entry at i
if(high<j || ((tableVal>lowInt) && (high==j))) {
limit=i;
} else {
start=i;
}
//System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
// KLUDGE: IBM JIT in 1.4.0 is sooo broken
// The below lines make TestExhaustive pass
// NOTE(review): do not remove this debug block without re-running
// TestExhaustive; the loop's behavior on that JIT depends on it.
if(ICUDebug.enabled()){
System.err.println("\t\t j = " + Utility.hex(j,4) +
"\t i = " + Utility.hex(i,4) +
"\t high = "+ Utility.hex(high) +
"\t low = " + Utility.hex(lowInt,4) +
"\t table[i+1]: "+ Utility.hex(tableVal,4)
);
}
}
/* found? */
h=table[start];
//System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
int tableVal1 = table[start+1];
int lowInt = low;
// match: low 5 bits of the first unit hold high(c), second unit holds low(c)
if(high==(h&0x1f) && lowInt==tableVal1) {
int tableVal2 = table[start+2];
i=tableVal2;
if((h&0x8000)==0) {
/* the result is an index to a USerializedSet */
return fillSet.getSet(startSets,(i-idxs.length));
} else {
/*
 * single-code point set {x} in
 * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
 */
//i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
int temp = ((int)h & 0x1f00)<<8;
i|=temp; /* add high bits from high(c) */
fillSet.setToOne(i);
return true;
}
}
}
}
return false; /* not found */
}
/**
 * Adds the start code point of each same-value range of the auxiliary trie
 * (when format 2.1+ data is loaded) plus the Hangul boundary code points
 * needed for normalization-property enumeration.
 *
 * @param set the set to add the code points to
 * @return set, for call chaining
 */
public static UnicodeSet addPropertyStarts(UnicodeSet set) {
    /* add the start code point of each same-value range of the aux trie */
    if (isFormatVersion_2_1) {
        //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
        TrieIterator rangeIter = new TrieIterator(AuxTrieImpl.auxTrie);
        RangeValueIterator.Element rangeElement = new RangeValueIterator.Element();
        while (rangeIter.next(rangeElement)) {
            set.add(rangeElement.start);
        }
    }
    /* add Hangul LV syllables and LV+1 because of skippables */
    final int hangulLimit = HANGUL_BASE + HANGUL_COUNT;
    for (int lv = HANGUL_BASE; lv < hangulLimit; lv += JAMO_T_COUNT) {
        set.add(lv);
        set.add(lv + 1);
    }
    /* add Hangul+1 to continue with other properties */
    set.add(hangulLimit);
    return set; // for chaining
}
}

View file

@ -103,24 +103,22 @@ public final class UCharacterProperty
public static final int SRC_PROPSVEC=2;
/** From unames.c/unames.icu */
public static final int SRC_NAMES=3;
/** From unorm.cpp/unorm.icu */
public static final int SRC_NORM=4;
/** From ucase.c/ucase.icu */
public static final int SRC_CASE=5;
public static final int SRC_CASE=4;
/** From ubidi_props.c/ubidi.icu */
public static final int SRC_BIDI=6;
public static final int SRC_BIDI=5;
/** From uchar.c/uprops.icu main trie as well as properties vectors trie */
public static final int SRC_CHAR_AND_PROPSVEC=7;
public static final int SRC_CHAR_AND_PROPSVEC=6;
/** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
public static final int SRC_CASE_AND_NORM=8;
public static final int SRC_CASE_AND_NORM=7;
/** From normalizer2impl.cpp/nfc.nrm */
public static final int SRC_NFC=9;
public static final int SRC_NFC=8;
/** From normalizer2impl.cpp/nfkc.nrm */
public static final int SRC_NFKC=10;
public static final int SRC_NFKC=9;
/** From normalizer2impl.cpp/nfkc_cf.nrm */
public static final int SRC_NFKC_CF=11;
public static final int SRC_NFKC_CF=10;
/** One more than the highest UPropertySource (SRC_) constant. */
public static final int SRC_COUNT=12;
public static final int SRC_COUNT=11;
// public methods ----------------------------------------------------
@ -310,7 +308,7 @@ public final class UCharacterProperty
new BinaryProperties( SRC_NFKC, 0 ), /* UCHAR_NFKD_INERT */
new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_NFC_INERT */
new BinaryProperties( SRC_NFKC, 0 ), /* UCHAR_NFKC_INERT */
new BinaryProperties( SRC_NORM, 0 ), /* UCHAR_SEGMENT_STARTER */
new BinaryProperties( SRC_NFC, 0 ), /* UCHAR_SEGMENT_STARTER */
new BinaryProperties( 1, ( 1 << PATTERN_SYNTAX) ),
new BinaryProperties( 1, ( 1 << PATTERN_WHITE_SPACE) ),
new BinaryProperties( SRC_CHAR_AND_PROPSVEC, 0 ), /* UCHAR_POSIX_ALNUM */
@ -372,25 +370,25 @@ public final class UCharacterProperty
} catch (IOException e) {
return false;
}
} else if(column==SRC_NORM) {
/* normalization properties from unorm.icu */
switch(which) {
case UProperty.SEGMENT_STARTER:
return NormalizerImpl.isCanonSafeStart(c);
default:
break;
}
} else if(column==SRC_NFC || column==SRC_NFKC) {
} else if(column==SRC_NFC) {
/* normalization properties from nfc.nrm */
switch(which) {
case UProperty.FULL_COMPOSITION_EXCLUSION: {
// By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
Normalizer2Impl impl=Norm2AllModes.getNFCInstanceNoIOException().impl;
return impl.isCompNo(impl.getNorm16(c));
}
case UProperty.SEGMENT_STARTER:
return Norm2AllModes.getNFCInstanceNoIOException().impl.
ensureCanonIterData().isCanonSegmentStarter(c);
default:
// UCHAR_NF..._INERT properties
// UCHAR_NF[CD]_INERT properties
return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
}
} else if(column==SRC_NFKC) {
/* normalization properties from nfkc.nrm */
// UCHAR_NFK[CD]_INERT properties
return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
} else if(column==SRC_NFKC_CF) {
// currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstanceNoIOException().impl;

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
@ -112,28 +112,26 @@ public final class USerializedSet {
if(rangeIndex<bmpLength) {
range[0]=array[rangeIndex++];
if(rangeIndex<bmpLength) {
range[1]=array[rangeIndex];
range[1]=array[rangeIndex]-1;
} else if(rangeIndex<length) {
range[1]=(((int)array[rangeIndex])<<16)|array[rangeIndex+1];
range[1]=((((int)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
} else {
range[1]=0x110000;
range[1]=0x10ffff;
}
range[1]-=1;
return true;
} else {
rangeIndex-=bmpLength;
rangeIndex*=2; /* address pairs of pairs of units */
length-=bmpLength;
if(rangeIndex<length) {
int suppLength=length-bmpLength;
if(rangeIndex<suppLength) {
int offset=arrayOffset+bmpLength;
range[0]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
rangeIndex+=2;
if(rangeIndex<length) {
range[1]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
if(rangeIndex<suppLength) {
range[1]=((((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1])-1;
} else {
range[1]=0x110000;
range[1]=0x10ffff;
}
range[1]-=1;
return true;
} else {
return false;

View file

@ -12,8 +12,8 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
@ -48,6 +48,9 @@ public final class CanonicalIterator {
* @stable ICU 2.4
*/
public CanonicalIterator(String source) {
Norm2AllModes allModes = Norm2AllModes.getNFCInstanceNoIOException();
nfd = allModes.decomp;
nfcImpl = allModes.impl.ensureCanonIterData();
setSource(source);
}
@ -110,7 +113,7 @@ public final class CanonicalIterator {
* @stable ICU 2.4
*/
public void setSource(String newSource) {
source = Normalizer.normalize(newSource, Normalizer.NFD);
source = nfd.normalize(newSource);
done = false;
// catch degenerate case
@ -131,9 +134,9 @@ public final class CanonicalIterator {
int i = UTF16.findOffsetFromCodePoint(source, 1);
for (; i < source.length(); i += UTF16.getCharCount(cp)) {
cp = UTF16.charAt(source, i);
if (NormalizerImpl.isCanonSafeStart(cp)) {
for (; i < source.length(); i += Character.charCount(cp)) {
cp = source.codePointAt(i);
if (nfcImpl.isCanonSegmentStarter(cp)) {
segmentList.add(source.substring(start, i)); // add up to i
start = i;
}
@ -226,6 +229,8 @@ public final class CanonicalIterator {
private static boolean SKIP_ZEROS = true;
// fields
private final Normalizer2 nfd;
private final Normalizer2Impl nfcImpl;
private String source;
private boolean done;
private String[][] pieces;
@ -286,37 +291,30 @@ public final class CanonicalIterator {
result.add(segment);
StringBuffer workingBuffer = new StringBuffer();
UnicodeSet starts = new UnicodeSet();
// cycle through all the characters
int cp=0;
int[] range = new int[2];
for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
int cp;
for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
// see if any character is at the start of some decomposition
cp = UTF16.charAt(segment, i);
USerializedSet starts = new USerializedSet();
if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
cp = segment.codePointAt(i);
if (!nfcImpl.getCanonStartSet(cp, starts)) {
continue;
}
int j=0;
// if so, see which decompositions match
int rangeCount = starts.countRanges();
for(j = 0; j < rangeCount; ++j) {
starts.getRange(j, range);
int end=range[1];
for (int cp2 = range[0]; cp2 <= end; ++cp2) {
Set<String> remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) {
continue;
}
for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
int cp2 = iter.codepoint;
Set<String> remainder = extract(cp2, segment, i, workingBuffer);
if (remainder == null) {
continue;
}
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp2);
for (String item : remainder) {
result.add(prefix + item);
}
// there were some matches, so add all the possibilities to the set.
String prefix= segment.substring(0,i);
prefix += UTF16.valueOf(cp2);
for (String item : remainder) {
result.add(prefix + item);
}
}
}
@ -368,8 +366,10 @@ public final class CanonicalIterator {
if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
+ ", " + Utility.hex(segment.substring(segmentPos)));
//String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
String decomp = Normalizer.normalize(comp, Normalizer.NFD);
String decomp = nfcImpl.getDecomposition(comp);
if (decomp == null) {
decomp = UTF16.valueOf(comp);
}
// See if it matches the start of segment (at segmentPos)
boolean ok = false;

View file

@ -6,11 +6,9 @@
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.VersionInfo;
import java.io.IOException;
import java.nio.CharBuffer;
@ -2403,14 +2401,6 @@ public final class Normalizer implements Cloneable {
}
}
/**
* Fetches the Unicode version burned into the Normalization data file
* @return VersionInfo version information of the normalizer
*/
static VersionInfo getUnicodeVersion() {
return NormalizerImpl.getUnicodeVersion();
}
/**
* An Appendable that writes into a char array with a capacity that may be
* less than array.length.

View file

@ -17,6 +17,7 @@ import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.ICUResourceBundle;
import com.ibm.icu.impl.StringPrepDataReader;
import com.ibm.icu.impl.UBiDiProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.util.VersionInfo;
@ -297,7 +298,7 @@ public final class StringPrep {
checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
VersionInfo normUniVer = Normalizer.getUnicodeVersion();
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/

View file

@ -17,7 +17,6 @@ import java.util.TreeSet;
import com.ibm.icu.impl.BMPSet;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.RuleCharacterIterator;
import com.ibm.icu.impl.SortedSetRelation;
import com.ibm.icu.impl.UBiDiProps;
@ -3092,11 +3091,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
UCharacterProperty.INSTANCE.addPropertyStarts(incl);
UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_NORM:
NormalizerImpl.addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_CASE_AND_NORM:
NormalizerImpl.addPropertyStarts(incl);
Norm2AllModes.getNFCInstanceNoIOException().impl.addPropertyStarts(incl);
UCaseProps.getSingleton().addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_NFC:

View file

@ -14,12 +14,12 @@ import java.util.Locale;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.UBiDiProps;
import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.impl.UCharacterName;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
@ -2204,19 +2204,6 @@ public final class UCharacterTest extends TestFmwk
}
}
/* add characters from a serialized set to a normal one */
private static void _setAddSerialized(UnicodeSet set, USerializedSet sset) {
// int start, end;
int i, count;
count=sset.countRanges();
int[] range = new int[2];
for(i=0; i<count; ++i) {
sset.getRange(i,range);
set.add(range[0],range[1]);
}
}
private boolean showADiffB(UnicodeSet a, UnicodeSet b,
String a_name, String b_name,
boolean expect,
@ -2284,7 +2271,6 @@ public final class UCharacterTest extends TestFmwk
public void TestConsistency() throws IOException {
UnicodeSet set1, set2, set3, set4;
USerializedSet sset;
int start, end;
int i, length;
@ -2368,10 +2354,9 @@ public final class UCharacterTest extends TestFmwk
*/
Normalizer2 norm2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
set1=new UnicodeSet();
Norm2AllModes.getNFCInstanceNoIOException().impl.
ensureCanonIterData().getCanonStartSet(0x49, set1);
set2=new UnicodeSet();
sset = new USerializedSet();
NormalizerImpl.getCanonStartSet(0x49,sset);
_setAddSerialized(set1, sset);
/* enumerate all characters that are plausible to be latin letters */
for(start=0xa0; start<0x2000; ++start) {
@ -2869,7 +2854,7 @@ public final class UCharacterTest extends TestFmwk
}
// Testing when "if(ch<NormalizerImpl.JAMO_L_BASE)" is true
for(int i=NormalizerImpl.JAMO_L_BASE-5; i<NormalizerImpl.JAMO_L_BASE; i++){
for(int i=Normalizer2Impl.Hangul.JAMO_L_BASE-5; i<Normalizer2Impl.Hangul.JAMO_L_BASE; i++){
if(UCharacter.getIntPropertyValue(i, UProperty.HANGUL_SYLLABLE_TYPE) != 0){
errln("UCharacter.getIntPropertyValue(ch, type) was suppose to return 0 " +
"when passing ch: " + i + "and type of Property.HANGUL_SYLLABLE_TYPE");
@ -2878,7 +2863,7 @@ public final class UCharacterTest extends TestFmwk
}
// Testing when "else if((ch-=NormalizerImpl.HANGUL_BASE)<0)" is true
for(int i=NormalizerImpl.HANGUL_BASE-5; i<NormalizerImpl.HANGUL_BASE; i++){
for(int i=Normalizer2Impl.Hangul.HANGUL_BASE-5; i<Normalizer2Impl.Hangul.HANGUL_BASE; i++){
if(UCharacter.getIntPropertyValue(i, UProperty.HANGUL_SYLLABLE_TYPE) != 0){
errln("UCharacter.getIntPropertyValue(ch, type) was suppose to return 0 " +
"when passing ch: " + i + "and type of Property.HANGUL_SYLLABLE_TYPE");

View file

@ -11,12 +11,11 @@ import java.text.StringCharacterIterator;
import java.util.Random;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.USerializedSet;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.*;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UTF16;
@ -1998,21 +1997,17 @@ public class BasicTest extends TestFmwk {
// test cases with i and I to make sure Turkic works
char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
USerializedSet sset=new USerializedSet();
UnicodeSet set = new UnicodeSet();
UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
nfcImpl.ensureCanonIterData();
String s1, s2;
int start, end;
// collect all sets into one for contiguous output
int[] startEnd = new int[2];
for(i=0; i<iI.length; ++i) {
if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {
count=sset.countRanges();
for(j=0; j<count; ++j) {
sset.getRange(j, startEnd);
set.add(startEnd[0], startEnd[1]);
}
if(nfcImpl.getCanonStartSet(iI[i], iSet)) {
set.addAll(iSet);
}
}
@ -2771,20 +2766,24 @@ public class BasicTest extends TestFmwk {
USerializedSet sset=new USerializedSet();
UnicodeSet set = new UnicodeSet();
int start, end;
char[] serialized = {
0x8007, // length
3, // bmpLength
0xc0, 0xfe, 0xfffc,
1, 9, 0x10, 0xfffc
};
sset.getSet(serialized, 0);
// collect all sets into one for contiguous output
int[] startEnd = new int[2];
if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {
int count=sset.countRanges();
for(int j=0; j<count; ++j) {
sset.getRange(j, startEnd);
set.add(startEnd[0], startEnd[1]);
}
int count=sset.countRanges();
for(int j=0; j<count; ++j) {
sset.getRange(j, startEnd);
set.add(startEnd[0], startEnd[1]);
}
// test all of these precomposed characters
// test all of these characters
UnicodeSetIterator it = new UnicodeSetIterator(set);
while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
start=it.codepoint;
@ -2793,10 +2792,11 @@ public class BasicTest extends TestFmwk {
if(!sset.contains(start)){
errln("USerializedSet.contains failed for "+Utility.hex(start,8));
}
++start;
}
}
}
public void TestReturnFailure(){
char[] term = {'r','\u00e9','s','u','m','\u00e9' };
char[] decomposed_term = new char[10 + term.length + 2];

View file

@ -12,9 +12,6 @@ import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Set;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CanonicalIterator;
@ -41,17 +38,6 @@ public class TestCanonicalIterator extends TestFmwk {
{"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
};
public void TestOldAndNew() {
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
nfcImpl.ensureCanonIterData();
for (int c = 0; c <= 0x10ffff; ++c) {
if (nfcImpl.isCanonSegmentStarter(c) != NormalizerImpl.isCanonSafeStart(c)) {
errln(String.format("old!=new segment starter for U+%04x: old %b new %b",
c, NormalizerImpl.isCanonSafeStart(c), nfcImpl.isCanonSegmentStarter(c)));
}
}
}
public void TestExhaustive() {
int counter = 0;
CanonicalIterator it = new CanonicalIterator("");