ICU-7273 build data for CanonicalIterator start sets on the fly; replace remaining uses of NormalizerImpl
X-SVN-Rev: 27561
parent 0ec6c28016
commit b15f884b16
12 changed files with 183 additions and 986 deletions
@@ -33,6 +33,8 @@ public final class Normalizer2Impl {
         public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
         public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;
 
+        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;
+
         public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
         public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;
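The added constants feed the Hangul fast path in the new canon-start-set code further down. A minimal sketch of the standard Hangul composition arithmetic they encode (standard Unicode values, not part of this commit):

```java
// Each lead jamo L owns a contiguous block of JAMO_VT_COUNT = 21*28 = 588
// syllables starting at HANGUL_BASE + L*JAMO_VT_COUNT.
int JAMO_V_COUNT = 21, JAMO_T_COUNT = 28;
int JAMO_VT_COUNT = JAMO_V_COUNT * JAMO_T_COUNT;   // 588
int HANGUL_BASE = 0xAC00;

int l = 0, v = 0, t = 0;  // jamo indices: U+1100, U+1161, no trail jamo
int syllable = HANGUL_BASE + (l * JAMO_V_COUNT + v) * JAMO_T_COUNT + t;  // U+AC00
```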
@@ -502,25 +504,19 @@ public final class Normalizer2Impl {
         canonStartSets=new ArrayList<UnicodeSet>();
         Iterator<Trie2.Range> trieIterator=normTrie.iterator();
         while(trieIterator.hasNext()) {
-            Trie2.Range range=trieIterator.next();
-            int norm16=range.value;
-            if(norm16==0) {
-                continue;  // inert
-            }
-            if(norm16==minYesNo) {
-                // Hangul LV & LVT: Set has-compositions for all syllables
-                // to minimize the trie size, although only LV syllables
-                // do have compositions. Handle at runtime.
-                // Set the same value for the whole range because
-                // there cannot be other data. Hangul syllables are segment starters,
-                // and since they decompose they cannot have canonStartSets.
-                // (There is no decomposable character in a decomposition mapping.)
-                range.value=CANON_HAS_COMPOSITIONS;
-                newData.setRange(range, true);
+            final Trie2.Range range=trieIterator.next();
+            final int norm16=range.value;
+            if(range.leadSurrogate || norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
+                // Inert, or 2-way mapping (including Hangul syllable).
+                // We do not write a canonStartSet for any yesNo character.
+                // Composites from 2-way mappings are added at runtime from the
+                // starter's compositions list, and the other characters in
+                // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
+                // "maybe" characters.
+                continue;
+            }
             for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
-                int oldValue=newData.get(c);
+                final int oldValue=newData.get(c);
                 int newValue=oldValue;
                 if(norm16>=minMaybeYes) {
                     // not a segment starter if it occurs in a decomposition or has cc!=0
@@ -531,36 +527,39 @@ public final class Normalizer2Impl {
                 } else if(norm16<minYesNo) {
                     newValue|=CANON_HAS_COMPOSITIONS;
                 } else {
-                    // c has a decomposition
+                    // c has a one-way decomposition
                     int c2=c;
-                    while(limitNoNo<=norm16 && norm16<minMaybeYes) {
-                        c2=this.mapAlgorithmic(c2, norm16);
-                        norm16=getNorm16(c2);
+                    int norm16_2=norm16;
+                    while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
+                        c2=this.mapAlgorithmic(c2, norm16_2);
+                        norm16_2=getNorm16(c2);
                     }
-                    if(minYesNo<=norm16 && norm16<limitNoNo) {
+                    if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                         // c decomposes, get everything from the variable-length extra data
-                        int firstUnit=extraData.charAt(norm16++);
-                        if(c==c2 && (firstUnit&MAPPING_PLUS_COMPOSITION_LIST)!=0) {
-                            newValue|=CANON_HAS_COMPOSITIONS;  // original c has compositions
-                        }
+                        int firstUnit=extraData.charAt(norm16_2++);
                         int length=firstUnit&MAPPING_LENGTH_MASK;
                         if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
-                            if(c==c2 && (extraData.charAt(norm16)&0xff)!=0) {
+                            if(c==c2 && (extraData.charAt(norm16_2)&0xff)!=0) {
                                 newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                             }
-                            ++norm16;
+                            ++norm16_2;
                         }
                         // Skip empty mappings (no characters in the decomposition).
                         if(length!=0) {
                             // add c to first code point's start set
-                            int limit=norm16+length;
-                            c2=extraData.codePointAt(norm16);
+                            int limit=norm16_2+length;
+                            c2=extraData.codePointAt(norm16_2);
                             addToStartSet(newData, c, c2);
-                            // set CANON_NOT_SEGMENT_STARTER for each remaining code point
-                            while((norm16+=Character.charCount(c2))<limit) {
-                                c2=extraData.codePointAt(norm16);
-                                int c2Value=newData.get(c2);
-                                if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
-                                    newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
+                            // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
+                            // one-way mapping. A 2-way mapping is possible here after
+                            // intermediate algorithmic mapping.
+                            if(norm16_2>=minNoNo) {
+                                while((norm16_2+=Character.charCount(c2))<limit) {
+                                    c2=extraData.codePointAt(norm16_2);
+                                    int c2Value=newData.get(c2);
+                                    if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
+                                        newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
+                                    }
+                                }
+                            }
                         }
                     }
@@ -692,6 +691,29 @@ public final class Normalizer2Impl {
     public boolean isCanonSegmentStarter(int c) {
         return canonIterData.get(c)>=0;
     }
+    public boolean getCanonStartSet(int c, UnicodeSet set) {
+        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
+        if(canonValue==0) {
+            return false;
+        }
+        set.clear();
+        int value=canonValue&CANON_VALUE_MASK;
+        if((canonValue&CANON_HAS_SET)!=0) {
+            set.addAll(canonStartSets.get(value));
+        } else if(value!=0) {
+            set.add(value);
+        }
+        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
+            int norm16=getNorm16(c);
+            if(norm16==JAMO_L) {
+                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
+                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
+            } else {
+                addComposites(getCompositionsList(norm16), set);
+            }
+        }
+        return true;
+    }
 
     public static final int MIN_CCC_LCCC_CP=0x300;
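The new instance method getCanonStartSet(int, UnicodeSet) replaces the old static NormalizerImpl.getCanonStartSet(int, USerializedSet). A minimal usage sketch, mirroring the updated tests later in this commit (ensureCanonIterData() must run once before querying):

```java
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
nfcImpl.ensureCanonIterData();   // builds canonIterData and canonStartSets on the fly

UnicodeSet set = new UnicodeSet();
if (nfcImpl.getCanonStartSet(0x49, set)) {
    // set now holds the characters whose canonical decomposition starts
    // with 'I', e.g. U+00CC..U+00CF (Ì Í Î Ï).
}
```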
@@ -1503,7 +1525,7 @@ public final class Normalizer2Impl {
     /**
      * @return index into maybeYesCompositions, or -1
      */
-    private int getCompositionsListForDecompYesAndZeroCC(int norm16) {
+    private int getCompositionsListForDecompYes(int norm16) {
         if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
             return -1;
         } else {
@@ -1527,6 +1549,15 @@ public final class Normalizer2Impl {
             (firstUnit&MAPPING_LENGTH_MASK)+  // + mapping length
             ((firstUnit>>7)&1);  // +1 if MAPPING_HAS_CCC_LCCC_WORD
     }
+    /**
+     * @param c code point must have compositions
+     * @return index into maybeYesCompositions
+     */
+    private int getCompositionsList(int norm16) {
+        return isDecompYes(norm16) ?
+                getCompositionsListForDecompYes(norm16) :
+                getCompositionsListForComposite(norm16);
+    }
 
     // Decompose a short piece of text which is likely to contain characters that
     // fail the quick check loop and/or where the quick check loop's overhead
@@ -1639,6 +1670,29 @@ public final class Normalizer2Impl {
         }
         return -1;
     }
+    /**
+     * @param c Character which has compositions
+     * @param set recursively receives the composites from c's compositions
+     */
+    private void addComposites(int list, UnicodeSet set) {
+        int firstUnit, compositeAndFwd;
+        do {
+            firstUnit=maybeYesCompositions.charAt(list);
+            if((firstUnit&COMP_1_TRIPLE)==0) {
+                compositeAndFwd=maybeYesCompositions.charAt(list+1);
+                list+=2;
+            } else {
+                compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
+                                maybeYesCompositions.charAt(list+2);
+                list+=3;
+            }
+            int composite=compositeAndFwd>>1;
+            if((compositeAndFwd&1)!=0) {
+                addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
+            }
+            set.add(composite);
+        } while((firstUnit&COMP_1_LAST_TUPLE)==0);
+    }
     /*
      * Recomposes the buffer text starting at recomposeStartIndex
      * (which is in NFD - decomposed and canonically ordered),
@@ -1777,7 +1831,7 @@ public final class Normalizer2Impl {
             // If c did not combine, then check if it is a starter.
             if(cc==0) {
                 // Found a new starter.
-                if((compositionsList=getCompositionsListForDecompYesAndZeroCC(norm16))>=0) {
+                if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
                     // It may combine with something, prepare for it.
                     if(c<=0xffff) {
                         starterIsSupplementary=false;
@@ -1,427 +0,0 @@
-/*
- *******************************************************************************
- * Copyright (C) 1996-2008, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
- *******************************************************************************
- */
-
-package com.ibm.icu.impl;
-import java.io.*;
-import com.ibm.icu.impl.ICUDebug;
-
-/**
- * @version 1.0
- * @author Ram Viswanadha
- */
-
-/*
- * Description of the format of unorm.icu version 2.1.
- *
- * Main change from version 1 to version 2:
- * Use of new, common Trie instead of normalization-specific tries.
- * Change to version 2.1: add third/auxiliary trie with associated data.
- *
- * For more details of how to use the data structures see the code
- * in unorm.cpp (runtime normalization code) and
- * in gennorm.c and gennorm/store.c (build-time data generation).
- *
- * For the serialized format of Trie see Trie.c/TrieHeader.
- *
- * - Overall partition
- *
- * unorm.icu customarily begins with a UDataInfo structure, see udata.h and .c.
- * After that there are the following structures:
- *
- * char indexes[INDEX_TOP];            -- INDEX_TOP=32, see enum in this file
- *
- * Trie normTrie;                      -- size in bytes=indexes[INDEX_TRIE_SIZE]
- *
- * char extraData[extraDataTop];       -- extraDataTop=indexes[INDEX_UCHAR_COUNT]
- *                                        extraData[0] contains the number of units for
- *                                        FC_NFKC_Closure (formatVersion>=2.1)
- *
- * char combiningTable[combiningTableTop]; -- combiningTableTop=indexes[INDEX_COMBINE_DATA_COUNT]
- *                                        combiningTableTop may include one 16-bit padding unit
- *                                        to make sure that fcdTrie is 32-bit-aligned
- *
- * Trie fcdTrie;                       -- size in bytes=indexes[INDEX_FCD_TRIE_SIZE]
- *
- * Trie auxTrie;                       -- size in bytes=indexes[INDEX_AUX_TRIE_SIZE]
- *
- * char canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[INDEX_CANON_SET_COUNT]
- *                                        serialized USets, see uset.c
- *
- *
- * The indexes array contains lengths and sizes of the following arrays and structures
- * as well as the following values:
- *  indexes[INDEX_COMBINE_FWD_COUNT]=combineFwdTop
- *      -- one more than the highest combining index computed for forward-only-combining characters
- *  indexes[INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
- *      -- number of combining indexes computed for both-ways-combining characters
- *  indexes[INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
- *      -- number of combining indexes computed for backward-only-combining characters
- *
- *  indexes[INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
- *      -- first code point with a quick check NF* value of NO/MAYBE
- *
- *
- * - Tries
- *
- * The main structures are two Trie tables ("compact arrays"),
- * each with one index array and one data array.
- * See Trie.h and Trie.c.
- *
- *
- * - Tries in unorm.icu
- *
- * The first trie (normTrie above)
- * provides data for the NF* quick checks and normalization.
- * The second trie (fcdTrie above) provides data just for FCD checks.
- *
- *
- * - norm32 data words from the first trie
- *
- * The norm32Table contains one 32-bit word "norm32" per code point.
- * It contains the following bit fields:
- * 31..16   extra data index, EXTRA_SHIFT is used to shift this field down
- *          if this index is <EXTRA_INDEX_TOP then it is an index into
- *          extraData[] where variable-length normalization data for this
- *          code point is found
- *          if this index is <EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
- *          then this is a norm32 for a leading surrogate, and the index
- *          value is used together with the following trailing surrogate
- *          code unit in the second trie access
- *          if this index is >=EXTRA_INDEX_TOP+EXTRA_SURROGATE_TOP
- *          then this is a norm32 for a "special" character,
- *          i.e., the character is a Hangul syllable or a Jamo
- *          see EXTRA_HANGUL etc.
- *          generally, instead of extracting this index from the norm32 and
- *          comparing it with the above constants,
- *          the normalization code compares the entire norm32 value
- *          with MIN_SPECIAL, SURROGATES_TOP, MIN_HANGUL etc.
- *
- * 15..8    combining class (cc) according to UnicodeData.txt
- *
- *  7..6    COMBINES_ANY flags, used in composition to see if a character
- *          combines with any following or preceding character(s)
- *          at all
- *     7    COMBINES_BACK
- *     6    COMBINES_FWD
- *
- *  5..0    quick check flags, set for "no" or "maybe", with separate flags for
- *          each normalization form
- *          the higher bits are "maybe" flags; for NF*D there are no such flags
- *          the lower bits are "no" flags for all forms, in the same order
- *          as the "maybe" flags,
- *          which is (MSB to LSB): NFKD NFD NFKC NFC
- *  5..4    QC_ANY_MAYBE
- *  3..0    QC_ANY_NO
- *          see further related constants
- *
- *
- * - Extra data per code point
- *
- * "Extra data" is referenced by the index in norm32.
- * It is variable-length data. It is only present, and only those parts
- * of it are, as needed for a given character.
- * The norm32 extra data index is added to the beginning of extraData[]
- * to get to a vector of 16-bit words with data at the following offsets:
- *
- * [-1]     Combining index for composition.
- *          Stored only if norm32&COMBINES_ANY .
- * [0]      Lengths of the canonical and compatibility decomposition strings.
- *          Stored only if there are decompositions, i.e.,
- *          if norm32&(QC_NFD|QC_NFKD)
- *          High byte: length of NFKD, or 0 if none
- *          Low byte: length of NFD, or 0 if none
- *          Each length byte also has another flag:
- *              Bit 7 of a length byte is set if there are non-zero
- *              combining classes (cc's) associated with the respective
- *              decomposition. If this flag is set, then the decomposition
- *              is preceded by a 16-bit word that contains the
- *              leading and trailing cc's.
- *              Bits 6..0 of a length byte are the length of the
- *              decomposition string, not counting the cc word.
- * [1..n]   NFD
- * [n+1..]  NFKD
- *
- * Each of the two decompositions consists of up to two parts:
- * - The 16-bit words with the leading and trailing cc's.
- *   This is only stored if bit 7 of the corresponding length byte
- *   is set. In this case, at least one of the cc's is not zero.
- *   High byte: leading cc==cc of the first code point in the decomposition string
- *   Low byte: trailing cc==cc of the last code point in the decomposition string
- * - The decomposition string in UTF-16, with length code units.
- *
- *
- * - Combining indexes and combiningTable[]
- *
- * Combining indexes are stored at the [-1] offset of the extra data
- * if the character combines forward or backward with any other characters.
- * They are used for (re)composition in NF*C.
- * Values of combining indexes are arranged according to whether a character
- * combines forward, backward, or both ways:
- *    forward-only < both ways < backward-only
- *
- * The index values for forward-only and both-ways combining characters
- * are indexes into the combiningTable[].
- * The index values for backward-only combining characters are simply
- * incremented from the preceding index values to be unique.
- *
- * In the combiningTable[], a variable-length list
- * of variable-length (back-index, code point) pair entries is stored
- * for each forward-combining character.
- *
- * These back-indexes are the combining indexes of both-ways or backward-only
- * combining characters that the forward-combining character combines with.
- *
- * Each list is sorted in ascending order of back-indexes.
- * Each list is terminated with the last back-index having bit 15 set.
- *
- * Each pair (back-index, code point) takes up either 2 or 3
- * 16-bit words.
- * The first word of a list entry is the back-index, with its bit 15 set if
- * this is the last pair in the list.
- *
- * The second word contains flags in bits 15..13 that determine
- * if there is a third word and how the combined character is encoded:
- * 15   set if there is a third word in this list entry
- * 14   set if the result is a supplementary character
- * 13   set if the result itself combines forward
- *
- * According to these bits 15..14 of the second word,
- * the result character is encoded as follows:
- * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
- *          the second word.
- * 10       The result is 0x2000..0xffff and stored in the third word.
- *          Bits 12..0 of the second word are not used.
- * 11       The result is a supplementary character.
- *          Bits 9..0 of the leading surrogate are in bits 9..0 of
- *          the second word.
- *          Add 0xd800 to these bits to get the complete surrogate.
- *          Bits 12..10 of the second word are not used.
- *          The trailing surrogate is stored in the third word.
- *
- *
- * - FCD trie
- *
- * The FCD trie is very simple.
- * It is a folded trie with 16-bit data words.
- * In each word, the high byte contains the leading cc of the character,
- * and the low byte contains the trailing cc of the character.
- * These cc's are the cc's of the first and last code points in the
- * canonical decomposition of the character.
- *
- * Since all 16 bits are used for cc's, lead surrogates must be tested
- * by checking the code unit instead of the trie data.
- * This is done only if the 16-bit data word is not zero.
- * If the code unit is a leading surrogate and the data word is not zero,
- * then instead of cc's it contains the offset for the second trie lookup.
- *
- *
- * - Auxiliary trie and data
- *
- * The auxiliary 16-bit trie contains data for additional properties.
- * Bits
- * 15..13   reserved
- *     12   not NFC_Skippable (f) (formatVersion>=2.2)
- *     11   flag: not a safe starter for canonical closure
- *     10   composition exclusion
- *  9.. 0   index into extraData[] to FC_NFKC_Closure string
- *          (not for lead surrogate),
- *          or lead surrogate offset (for lead surrogate, if 9..0 not zero)
- *
- * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
- * (used in NormalizerTransliterator)
- *
- * A skippable character is
- * a) unassigned, or ALL of the following:
- * b) of combining class 0.
- * c) not decomposed by this normalization form.
- * AND if NFC or NFKC,
- * d) can never compose with a previous character.
- * e) can never compose with a following character.
- * f) can never change if another character is added.
- *    Example: a-breve might satisfy all but f, but if you
- *    add an ogonek it changes to a-ogonek + breve
- *
- * a)..e) must be tested from norm32.
- * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
- * into the auxiliary trie.
- * The same bit is used for NFC and NFKC; (c) differs for them.
- * As usual, we build the "not skippable" flags so that unassigned
- * code points get a 0 bit.
- * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
- * Test Hangul LV syllables entirely in code.
- *
- *
- * - FC_NFKC_Closure strings in extraData[]
- *
- * Strings are either stored as a single code unit or as the length
- * followed by that many units.
- *
- * - structure inside canonStartSets[]
- *
- * This array maps from code points c to sets of code points (USerializedSet).
- * The result sets are the code points whose canonical decompositions start
- * with c.
- *
- * canonStartSets[] contains the following sub-arrays:
- *
- * indexes[_NORM_SET_INDEX_TOP]
- *   - contains lengths of sub-arrays etc.
- *
- * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
- *   - contains serialized sets (USerializedSet) of canonical starters for
- *     enumerating canonically equivalent strings
- *     indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
- *     for details about the structure see uset.c
- *
- * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
- *   - a sorted search table for BMP code points whose results are
- *     either indexes to USerializedSets or single code points for
- *     single-code point sets;
- *     each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
- *     if yy==01 then there is a USerializedSet at canonStartSets+x
- *     else build a USerializedSet with result as the single code point
- *
- * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
- *   - a sorted search table for supplementary code points whose results are
- *     either indexes to USerializedSets or single code points for
- *     single-code point sets;
- *     each entry is a triplet of { high16(cp), low16(cp), result }
- *     each code point's high-word may contain extra data in bits 15..5:
- *     if the high word has bit 15 set, then build a set with a single code point
- *     which is (((high16(cp)&0x1f00)<<8)|result;
- *     else there is a USerializedSet at canonStartSets+result
- */
-final class NormalizerDataReader implements ICUBinary.Authenticate {
-    private final static boolean debug = ICUDebug.enabled("NormalizerDataReader");
-
-    /**
-     * <p>Protected constructor.</p>
-     * @param inputStream ICU uprop.dat file input stream
-     * @exception IOException throw if data file fails authentication
-     */
-    protected NormalizerDataReader(InputStream inputStream)
-                                        throws IOException{
-        if(debug) System.out.println("Bytes in inputStream " + inputStream.available());
-
-        unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
-
-        if(debug) System.out.println("Bytes left in inputStream " +inputStream.available());
-
-        dataInputStream = new DataInputStream(inputStream);
-
-        if(debug) System.out.println("Bytes left in dataInputStream " +dataInputStream.available());
-    }
-
-    // protected methods -------------------------------------------------
-
-    protected int[] readIndexes(int length)throws IOException{
-        int[] indexes = new int[length];
-        //Read the indexes
-        for (int i = 0; i <length ; i++) {
-            indexes[i] = dataInputStream.readInt();
-        }
-        return indexes;
-    }
-    /**
-     * <p>Reads unorm.icu, parse it into blocks of data to be stored in
-     * NormalizerImpl.</P>
-     * @param normBytes
-     * @param fcdBytes
-     * @param auxBytes
-     * @param extraData
-     * @param combiningTable
-     * @param canonStartSets
-     * @exception IOException thrown when data reading fails
-     */
-    protected void read(byte[] normBytes, byte[] fcdBytes, byte[] auxBytes,
-                        char[] extraData, char[] combiningTable,
-                        Object[] canonStartSets)
-                        throws IOException
-    {
-        // Read the bytes that make up the normTrie
-        dataInputStream.readFully(normBytes);
-
-        // normTrieStream= new ByteArrayInputStream(normBytes);
-
-        // Read the extra data
-        for (int i = 0; i < extraData.length; i++) {
-            extraData[i] = dataInputStream.readChar();
-        }
-
-        // Read the combining class table
-        for (int i = 0; i < combiningTable.length; i++) {
-            combiningTable[i] = dataInputStream.readChar();
-        }
-
-        // Read the fcdTrie
-        dataInputStream.readFully(fcdBytes);
-
-        // Read the AuxTrie
-        dataInputStream.readFully(auxBytes);
-
-        // Read the canonical start sets
-        int[] canonStartSetsIndexes = new int[NormalizerImpl.SET_INDEX_TOP];
-
-        for (int i = 0; i < canonStartSetsIndexes.length; i++) {
-            canonStartSetsIndexes[i] = dataInputStream.readChar();
-        }
-
-        char[] startSets = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SETS_LENGTH] - NormalizerImpl.SET_INDEX_TOP];
-
-        for (int i = 0; i < startSets.length; i++) {
-            startSets[i] = dataInputStream.readChar();
-        }
-        char[] bmpTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_BMP_TABLE_LENGTH]];
-        for (int i = 0; i < bmpTable.length; i++) {
-            bmpTable[i] = dataInputStream.readChar();
-        }
-        char[] suppTable = new char[canonStartSetsIndexes[NormalizerImpl.SET_INDEX_CANON_SUPP_TABLE_LENGTH]];
-        for (int i = 0; i < suppTable.length; i++) {
-            suppTable[i] = dataInputStream.readChar();
-        }
-        canonStartSets[NormalizerImpl.CANON_SET_INDICIES_INDEX] = canonStartSetsIndexes;
-        canonStartSets[NormalizerImpl.CANON_SET_START_SETS_INDEX] = startSets;
-        canonStartSets[NormalizerImpl.CANON_SET_BMP_TABLE_INDEX] = bmpTable;
-        canonStartSets[NormalizerImpl.CANON_SET_SUPP_TABLE_INDEX] = suppTable;
-    }
-
-    public byte[] getDataFormatVersion(){
-        return DATA_FORMAT_VERSION;
-    }
-
-    public boolean isDataVersionAcceptable(byte version[])
-    {
-        return version[0] == DATA_FORMAT_VERSION[0]
-               && version[2] == DATA_FORMAT_VERSION[2]
-               && version[3] == DATA_FORMAT_VERSION[3];
-    }
-
-    public byte[] getUnicodeVersion(){
-        return unicodeVersion;
-    }
-    // private data members -------------------------------------------------
-
-    /**
-     * ICU data file input stream
-     */
-    private DataInputStream dataInputStream;
-
-    private byte[] unicodeVersion;
-
-    /**
-     * File format version that this class understands.
-     * No guarantees are made if a older version is used
-     * see store.c of gennorm for more information and values
-     */
-    private static final byte DATA_FORMAT_ID[] = {(byte)0x4E, (byte)0x6F,
-                                                  (byte)0x72, (byte)0x6D};
-    private static final byte DATA_FORMAT_VERSION[] = {(byte)0x2, (byte)0x2,
-                                                       (byte)0x5, (byte)0x2};
-}
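The removed format comment above documents the legacy bmpTable encoding. A hypothetical decoder for one { code point, result } pair, using the CANON_SET_BMP_* constants from the old NormalizerImpl (deleted next); illustrative only:

```java
// result = (binary) yy xxxxxx xxxxxxxx: yy==01 means x indexes a USerializedSet
// inside canonStartSets; any other value is itself a single-code point set.
static boolean isUSerializedSetIndex(char result) {
    return (result & 0xc000 /* CANON_SET_BMP_MASK */)
            == 0x4000      /* CANON_SET_BMP_IS_INDEX */;
}
```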
@@ -1,384 +0,0 @@
-/*
- *******************************************************************************
- * Copyright (C) 1996-2010, International Business Machines Corporation and
- * others. All Rights Reserved.
- *******************************************************************************
- */
-
-package com.ibm.icu.impl;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.BufferedInputStream;
-import java.io.InputStream;
-import java.util.MissingResourceException;
-
-import com.ibm.icu.text.UnicodeSet;
-import com.ibm.icu.util.RangeValueIterator;
-import com.ibm.icu.util.VersionInfo;
-
-/**
- * @version 1.0
- * @author Ram Viswanadha
- */
-public final class NormalizerImpl {
-    // Static block for the class to initialize its own self
-    static final NormalizerImpl IMPL;
-
-    static
-    {
-        try
-        {
-            IMPL = new NormalizerImpl();
-        }
-        catch (Exception e)
-        {
-            throw new MissingResourceException(e.getMessage(), "", "");
-        }
-    }
-
-    static final int UNSIGNED_BYTE_MASK =0xFF;
-    static final long UNSIGNED_INT_MASK = 0xffffffffL;
-    /*
-     * This new implementation of the normalization code loads its data from
-     * unorm.icu, which is generated with the gennorm tool.
-     * The format of that file is described at the end of this file.
-     */
-    private static final String DATA_FILE_NAME = ICUResourceBundle.ICU_BUNDLE+"/unorm.icu";
-
-    /* indexes[] value names */
-    /* number of bytes in normalization trie */
-    static final int INDEX_TRIE_SIZE = 0;
-    /* number of chars in extra data */
-    static final int INDEX_CHAR_COUNT = 1;
-    /* number of uint16_t words for combining data */
-    static final int INDEX_COMBINE_DATA_COUNT = 2;
-    /* number of code points that combine forward */
-    static final int INDEX_COMBINE_FWD_COUNT = 3;
-    /* number of code points that combine forward and backward */
-    static final int INDEX_COMBINE_BOTH_COUNT = 4;
-    /* number of code points that combine backward */
-    static final int INDEX_COMBINE_BACK_COUNT = 5;
-    /* first code point with quick check NFC NO/MAYBE */
-    public static final int INDEX_MIN_NFC_NO_MAYBE = 6;
-    /* first code point with quick check NFKC NO/MAYBE */
-    public static final int INDEX_MIN_NFKC_NO_MAYBE = 7;
-    /* first code point with quick check NFD NO/MAYBE */
-    public static final int INDEX_MIN_NFD_NO_MAYBE = 8;
-    /* first code point with quick check NFKD NO/MAYBE */
-    public static final int INDEX_MIN_NFKD_NO_MAYBE = 9;
-    /* number of bytes in FCD trie */
-    static final int INDEX_FCD_TRIE_SIZE = 10;
-    /* number of bytes in the auxiliary trie */
-    static final int INDEX_AUX_TRIE_SIZE = 11;
-    /* number of uint16_t in the array of serialized USet */
-    static final int INDEX_CANON_SET_COUNT = 12;
-    /* changing this requires a new formatVersion */
-    static final int INDEX_TOP = 32;
-
-    /* AUX constants */
-    /* value constants for auxTrie */
-    private static final int AUX_UNSAFE_SHIFT = 11;
-    private static final int AUX_COMP_EX_SHIFT = 10;
-
-    private static final int AUX_MAX_FNC = 1<<AUX_COMP_EX_SHIFT;
-    private static final int AUX_UNSAFE_MASK = (int)((1<<AUX_UNSAFE_SHIFT) & UNSIGNED_INT_MASK);
-    private static final int AUX_FNC_MASK = (int)((AUX_MAX_FNC-1) & UNSIGNED_INT_MASK);
-
-    /* canonStartSets[0..31] contains indexes for what is in the array */
-    /* number of uint16_t in canonical starter sets */
-    static final int SET_INDEX_CANON_SETS_LENGTH = 0;
-    /* number of uint16_t in the BMP search table (contains pairs) */
-    static final int SET_INDEX_CANON_BMP_TABLE_LENGTH = 1;
-    /* number of uint16_t in the supplementary search table(contains triplets)*/
-    static final int SET_INDEX_CANON_SUPP_TABLE_LENGTH = 2;
-    /* changing this requires a new formatVersion */
-    static final int SET_INDEX_TOP = 32;
-
-    static final int CANON_SET_INDICIES_INDEX = 0;
-    static final int CANON_SET_START_SETS_INDEX = 1;
-    static final int CANON_SET_BMP_TABLE_INDEX = 2;
-    static final int CANON_SET_SUPP_TABLE_INDEX = 3;
-    /* 14 bit indexes to canonical USerializedSets */
-    static final int CANON_SET_MAX_CANON_SETS = 0x4000;
-    /* single-code point BMP sets are encoded directly in the search table
-     * except if result=0x4000..0x7fff
-     */
-    static final int CANON_SET_BMP_MASK = 0xc000;
-    static final int CANON_SET_BMP_IS_INDEX = 0x4000;
-
-    /**
-     * Internal option for cmpEquivFold() for decomposing.
-     * If not set, just do strcasecmp().
-     * @internal
-     */
-    public static final int COMPARE_EQUIV = 0x80000;
-
-    /*******************************/
-
-    /* Wrappers for Trie implementations */
-    static final class AuxTrieImpl implements Trie.DataManipulate{
-        static CharTrie auxTrie = null;
-        /**
-         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
-         * data the index array offset of the indexes for that lead surrogate.
-         * @param value data value for a surrogate from the trie, including
-         *        the folding offset
-         * @return data offset or 0 if there is no data for the lead surrogate
-         */
-        /* auxTrie: the folding offset is in bits 9..0 of the 16-bit trie result */
-        public int getFoldingOffset(int value) {
-            return (value & AUX_FNC_MASK) << SURROGATE_BLOCK_BITS;
-        }
-    }
-
-    /****************************************************/
-
-    private static AuxTrieImpl auxTrieImpl;
-    private static int[] indexes;
-    private static char[] combiningTable;
-    private static char[] extraData;
-    private static Object[] canonStartSets;
-
-    private static boolean isDataLoaded;
-    private static boolean isFormatVersion_2_1;
-    private static byte[] unicodeVersion;
-
-    /**
-     * Default buffer size of datafile
-     */
-    private static final int DATA_BUFFER_SIZE = 25000;
-
-    /**
-     * FCD check: everything below this code point is known to have a 0
-     * lead combining class
-     */
-    public static final int MIN_WITH_LEAD_CC=0x300;
-
-    /** Number of bits of a trail surrogate that are used in index table
-     * lookups.
-     */
-    private static final int SURROGATE_BLOCK_BITS=10-Trie.INDEX_STAGE_1_SHIFT_;
-
-    // protected constructor ---------------------------------------------
-
-    /**
-     * Constructor
-     * @exception thrown when data reading fails or data corrupted
-     */
-    private NormalizerImpl() throws IOException {
-        //data should be loaded only once
-        if(!isDataLoaded){
-
-            // jar access
-            InputStream i = ICUData.getRequiredStream(DATA_FILE_NAME);
-            BufferedInputStream b = new BufferedInputStream(i,DATA_BUFFER_SIZE);
-            NormalizerDataReader reader = new NormalizerDataReader(b);
-
-            // read the indexes
-            indexes = reader.readIndexes(NormalizerImpl.INDEX_TOP);
-
-            byte[] normBytes = new byte[indexes[NormalizerImpl.INDEX_TRIE_SIZE]];
-
-            int combiningTableTop = indexes[NormalizerImpl.INDEX_COMBINE_DATA_COUNT];
-            combiningTable = new char[combiningTableTop];
-
-            int extraDataTop = indexes[NormalizerImpl.INDEX_CHAR_COUNT];
-            extraData = new char[extraDataTop];
-
-            byte[] fcdBytes = new byte[indexes[NormalizerImpl.INDEX_FCD_TRIE_SIZE]];
-            byte[] auxBytes = new byte[indexes[NormalizerImpl.INDEX_AUX_TRIE_SIZE]];
-            canonStartSets=new Object[NormalizerImpl.CANON_SET_MAX_CANON_SETS];
-
-            auxTrieImpl = new AuxTrieImpl();
-
-            // load the rest of the data data and initialize the data members
-            reader.read(normBytes, fcdBytes,auxBytes, extraData, combiningTable,
-                        canonStartSets);
-
-            AuxTrieImpl.auxTrie = new CharTrie( new ByteArrayInputStream(auxBytes),auxTrieImpl );
-
-            // we reached here without any exceptions so the data is fully
-            // loaded set the variable to true
-            isDataLoaded = true;
-
-            // get the data format version
-            byte[] formatVersion = reader.getDataFormatVersion();
-
-            isFormatVersion_2_1 =( formatVersion[0]>2
-                                   ||
-                                   (formatVersion[0]==2 && formatVersion[1]>=1)
-                                 );
-            unicodeVersion = reader.getUnicodeVersion();
-            b.close();
-        }
-    }
-
-    /* ---------------------------------------------------------------------- */
-
-    /* Korean Hangul and Jamo constants */
-
-    public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
-    public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
-    public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
-
-    public static final int HANGUL_BASE=0xac00;
-
-    public static final int JAMO_L_COUNT=19;
-    public static final int JAMO_V_COUNT=21;
-    public static final int JAMO_T_COUNT=28;
-    public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
-
-    /* data access primitives ----------------------------------------------- */
-
-    public static VersionInfo getUnicodeVersion(){
-        return VersionInfo.getInstance(unicodeVersion[0], unicodeVersion[1],
-                                       unicodeVersion[2], unicodeVersion[3]);
-    }
-
-    public static boolean isCanonSafeStart(int c) {
-        if(isFormatVersion_2_1) {
-            int aux = AuxTrieImpl.auxTrie.getCodePointValue(c);
-            return (aux & AUX_UNSAFE_MASK) == 0;
-        } else {
-            return false;
-        }
-    }
-
-    public static boolean getCanonStartSet(int c, USerializedSet fillSet) {
-
-        if(fillSet!=null && canonStartSets!=null) {
-            /*
-             * binary search for c
-             *
-             * There are two search tables,
-             * one for BMP code points and one for supplementary ones.
-             * See unormimp.h for details.
-             */
-            char[] table;
-            int i=0, start, limit;
-
-            int[] idxs = (int[]) canonStartSets[CANON_SET_INDICIES_INDEX];
-            char[] startSets = (char[]) canonStartSets[CANON_SET_START_SETS_INDEX];
-
-            if(c<=0xffff) {
-                table=(char[]) canonStartSets[CANON_SET_BMP_TABLE_INDEX];
-                start=0;
-                limit=table.length;
-
-                /* each entry is a pair { c, result } */
-                while(start<limit-2) {
-                    i=(char)(((start+limit)/4)*2);
-                    if(c<table[i]) {
-                        limit=i;
-                    } else {
-                        start=i;
-                    }
-                }
-                //System.out.println(i);
-                /* found? */
-                if(c==table[start]) {
-                    i=table[start+1];
-                    if((i & CANON_SET_BMP_MASK)==CANON_SET_BMP_IS_INDEX) {
-                        /* result 01xxxxxx xxxxxx contains index x to a
-                         * USerializedSet */
-                        i&=(CANON_SET_MAX_CANON_SETS-1);
-                        return fillSet.getSet(startSets,(i-idxs.length));
-                    } else {
-                        /* other result values are BMP code points for
-                         * single-code point sets */
-                        fillSet.setToOne(i);
-                        return true;
-                    }
-                }
-            } else {
-                char high, low, h,j=0;
-
-                table=(char[]) canonStartSets[CANON_SET_SUPP_TABLE_INDEX];
-                start=0;
-                limit=table.length;
-
-                high=(char)(c>>16);
-                low=(char)c;
-
-                /* each entry is a triplet { high(c), low(c), result } */
-                while(start<limit-3) {
-                    /* (start+limit)/2 and address triplets */
-                    i=(char)(((start+limit)/6)*3);
-                    j=(char)(table[i]&0x1f); /* high word */
-                    int tableVal = table[i+1];
-                    int lowInt = low;
-                    if(high<j || ((tableVal>lowInt) && (high==j))) {
-                        limit=i;
-                    } else {
-                        start=i;
-                    }
-
-                    //System.err.println("\t((high==j) && (table[i+1]>low)) == " + ((high==j) && (tableVal>lowInt)) );
-
-                    // KLUDGE: IBM JIT in 1.4.0 is sooo broken
-                    // The below lines make TestExhaustive pass
-                    if(ICUDebug.enabled()){
-                        System.err.println("\t\t j = " + Utility.hex(j,4) +
-                                           "\t i = " + Utility.hex(i,4) +
-                                           "\t high = "+ Utility.hex(high) +
-                                           "\t low = " + Utility.hex(lowInt,4) +
-                                           "\t table[i+1]: "+ Utility.hex(tableVal,4)
-                                           );
-                    }
-
-                }
-
-                /* found? */
-                h=table[start];
-
-                //System.err.println("c: \\U"+ Integer.toHexString(c)+" i : "+Integer.toHexString(i) +" h : " + Integer.toHexString(h));
-                int tableVal1 = table[start+1];
-                int lowInt = low;
-
-                if(high==(h&0x1f) && lowInt==tableVal1) {
-                    int tableVal2 = table[start+2];
-                    i=tableVal2;
-                    if((h&0x8000)==0) {
-                        /* the result is an index to a USerializedSet */
-                        return fillSet.getSet(startSets,(i-idxs.length));
-                    } else {
-                        /*
-                         * single-code point set {x} in
-                         * triplet { 100xxxxx 000hhhhh llllllll llllllll xxxxxxxx xxxxxxxx }
-                         */
-                        //i|=((int)h & 0x1f00)<<8; /* add high bits from high(c) */
-                        int temp = ((int)h & 0x1f00)<<8;
-                        i|=temp; /* add high bits from high(c) */
-                        fillSet.setToOne(i);
-                        return true;
-                    }
-                }
-            }
-        }
-
-        return false; /* not found */
-    }
-
-    public static UnicodeSet addPropertyStarts(UnicodeSet set) {
-        int c;
-
-        /* add the start code point of each same-value range of each trie */
-        if(isFormatVersion_2_1){
-            //utrie_enum(&auxTrie, NULL, _enumPropertyStartsRange, set);
-            TrieIterator auxIter = new TrieIterator(AuxTrieImpl.auxTrie);
-            RangeValueIterator.Element auxResult = new RangeValueIterator.Element();
-            while(auxIter.next(auxResult)){
-                set.add(auxResult.start);
-            }
-        }
-        /* add Hangul LV syllables and LV+1 because of skippables */
-        for(c=HANGUL_BASE; c<HANGUL_BASE+HANGUL_COUNT; c+=JAMO_T_COUNT) {
-            set.add(c);
-            set.add(c+1);
-        }
-        set.add(HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
-        return set; // for chaining
-    }
-}
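The removed addPropertyStarts loop steps through Hangul LV syllables in JAMO_T_COUNT strides. A sketch of the standard check that stride relies on (an LV syllable has trail-jamo index 0); values are the standard Unicode constants:

```java
static boolean isHangulLV(int c) {
    int offset = c - 0xAC00;                        // HANGUL_BASE
    return 0 <= offset && offset < 19 * 21 * 28     // HANGUL_COUNT = 11172
            && offset % 28 == 0;                    // JAMO_T_COUNT
}
```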
@@ -103,24 +103,22 @@ public final class UCharacterProperty
     public static final int SRC_PROPSVEC=2;
     /** From unames.c/unames.icu */
     public static final int SRC_NAMES=3;
-    /** From unorm.cpp/unorm.icu */
-    public static final int SRC_NORM=4;
     /** From ucase.c/ucase.icu */
-    public static final int SRC_CASE=5;
+    public static final int SRC_CASE=4;
     /** From ubidi_props.c/ubidi.icu */
-    public static final int SRC_BIDI=6;
+    public static final int SRC_BIDI=5;
     /** From uchar.c/uprops.icu main trie as well as properties vectors trie */
-    public static final int SRC_CHAR_AND_PROPSVEC=7;
+    public static final int SRC_CHAR_AND_PROPSVEC=6;
     /** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
-    public static final int SRC_CASE_AND_NORM=8;
+    public static final int SRC_CASE_AND_NORM=7;
     /** From normalizer2impl.cpp/nfc.nrm */
-    public static final int SRC_NFC=9;
+    public static final int SRC_NFC=8;
     /** From normalizer2impl.cpp/nfkc.nrm */
-    public static final int SRC_NFKC=10;
+    public static final int SRC_NFKC=9;
     /** From normalizer2impl.cpp/nfkc_cf.nrm */
-    public static final int SRC_NFKC_CF=11;
+    public static final int SRC_NFKC_CF=10;
     /** One more than the highest UPropertySource (SRC_) constant. */
-    public static final int SRC_COUNT=12;
+    public static final int SRC_COUNT=11;
 
     // public methods ----------------------------------------------------
 
@@ -310,7 +308,7 @@ public final class UCharacterProperty
         new BinaryProperties( SRC_NFKC, 0 ),    /* UCHAR_NFKD_INERT */
         new BinaryProperties( SRC_NFC, 0 ),     /* UCHAR_NFC_INERT */
         new BinaryProperties( SRC_NFKC, 0 ),    /* UCHAR_NFKC_INERT */
-        new BinaryProperties( SRC_NORM, 0 ),    /* UCHAR_SEGMENT_STARTER */
+        new BinaryProperties( SRC_NFC, 0 ),     /* UCHAR_SEGMENT_STARTER */
         new BinaryProperties( 1, ( 1 << PATTERN_SYNTAX) ),
         new BinaryProperties( 1, ( 1 << PATTERN_WHITE_SPACE) ),
         new BinaryProperties( SRC_CHAR_AND_PROPSVEC, 0 ),  /* UCHAR_POSIX_ALNUM */
@@ -372,25 +370,25 @@ public final class UCharacterProperty
             } catch (IOException e) {
                 return false;
             }
-        } else if(column==SRC_NORM) {
-            /* normalization properties from unorm.icu */
-            switch(which) {
-            case UProperty.SEGMENT_STARTER:
-                return NormalizerImpl.isCanonSafeStart(c);
-            default:
-                break;
-            }
-        } else if(column==SRC_NFC || column==SRC_NFKC) {
+        } else if(column==SRC_NFC) {
             /* normalization properties from nfc.nrm */
             switch(which) {
             case UProperty.FULL_COMPOSITION_EXCLUSION: {
                 // By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
                 Normalizer2Impl impl=Norm2AllModes.getNFCInstanceNoIOException().impl;
                 return impl.isCompNo(impl.getNorm16(c));
             }
+            case UProperty.SEGMENT_STARTER:
+                return Norm2AllModes.getNFCInstanceNoIOException().impl.
+                    ensureCanonIterData().isCanonSegmentStarter(c);
             default:
-                // UCHAR_NF..._INERT properties
+                // UCHAR_NF[CD]_INERT properties
                 return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
             }
+        } else if(column==SRC_NFKC) {
+            /* normalization properties from nfkc.nrm */
+            // UCHAR_NFK[CD]_INERT properties
+            return Norm2AllModes.getN2WithImpl(which-UProperty.NFD_INERT).isInert(c);
         } else if(column==SRC_NFKC_CF) {
             // currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
             Normalizer2Impl kcf=Norm2AllModes.getNFKC_CFInstanceNoIOException().impl;
@@ -1,6 +1,6 @@
 /*
  *******************************************************************************
- * Copyright (C) 2002-2009, International Business Machines
+ * Copyright (C) 2002-2010, International Business Machines
  * Corporation and others. All Rights Reserved.
  *******************************************************************************
  */
@@ -112,28 +112,26 @@ public final class USerializedSet {
         if(rangeIndex<bmpLength) {
             range[0]=array[rangeIndex++];
             if(rangeIndex<bmpLength) {
-                range[1]=array[rangeIndex];
+                range[1]=array[rangeIndex]-1;
             } else if(rangeIndex<length) {
-                range[1]=(((int)array[rangeIndex])<<16)|array[rangeIndex+1];
+                range[1]=((((int)array[rangeIndex])<<16)|array[rangeIndex+1])-1;
             } else {
-                range[1]=0x110000;
+                range[1]=0x10ffff;
             }
-            range[1]-=1;
             return true;
         } else {
             rangeIndex-=bmpLength;
             rangeIndex*=2; /* address pairs of pairs of units */
-            length-=bmpLength;
-            if(rangeIndex<length) {
+            int suppLength=length-bmpLength;
+            if(rangeIndex<suppLength) {
                 int offset=arrayOffset+bmpLength;
                 range[0]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
                 rangeIndex+=2;
-                if(rangeIndex<length) {
-                    range[1]=(((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1];
+                if(rangeIndex<suppLength) {
+                    range[1]=((((int)array[offset+rangeIndex])<<16)|array[offset+rangeIndex+1])-1;
                 } else {
-                    range[1]=0x110000;
+                    range[1]=0x10ffff;
                 }
-                range[1]-=1;
                 return true;
             } else {
                 return false;
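With this change getRange() computes an inclusive end for range[1] directly instead of storing an exclusive limit and subtracting 1 at a shared point. The calling pattern it serves, taken from the tests in this commit:

```java
USerializedSet sset = new USerializedSet();
UnicodeSet set = new UnicodeSet();
int[] range = new int[2];
for (int j = 0; j < sset.countRanges(); ++j) {
    sset.getRange(j, range);
    set.add(range[0], range[1]);   // both bounds inclusive
}
```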
@@ -12,8 +12,8 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 
-import com.ibm.icu.impl.NormalizerImpl;
-import com.ibm.icu.impl.USerializedSet;
+import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.Normalizer2Impl;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
 
@@ -48,6 +48,9 @@ public final class CanonicalIterator {
      * @stable ICU 2.4
      */
     public CanonicalIterator(String source) {
+        Norm2AllModes allModes = Norm2AllModes.getNFCInstanceNoIOException();
+        nfd = allModes.decomp;
+        nfcImpl = allModes.impl.ensureCanonIterData();
         setSource(source);
     }
 
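The constructor now builds the canon-iterator data eagerly via ensureCanonIterData(). The public API (stable since ICU 2.4) is unchanged; a short usage sketch:

```java
CanonicalIterator it = new CanonicalIterator("\u00C5d");   // "Åd"
for (String equiv = it.next(); equiv != null; equiv = it.next()) {
    System.out.println(equiv);   // Åd, A\u030Ad, ...
}
```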
@@ -110,7 +113,7 @@ public final class CanonicalIterator {
      * @stable ICU 2.4
      */
     public void setSource(String newSource) {
-        source = Normalizer.normalize(newSource, Normalizer.NFD);
+        source = nfd.normalize(newSource);
         done = false;
 
         // catch degenerate case
@@ -131,9 +134,9 @@ public final class CanonicalIterator {
 
         int i = UTF16.findOffsetFromCodePoint(source, 1);
 
-        for (; i < source.length(); i += UTF16.getCharCount(cp)) {
-            cp = UTF16.charAt(source, i);
-            if (NormalizerImpl.isCanonSafeStart(cp)) {
+        for (; i < source.length(); i += Character.charCount(cp)) {
+            cp = source.codePointAt(i);
+            if (nfcImpl.isCanonSegmentStarter(cp)) {
                 segmentList.add(source.substring(start, i)); // add up to i
                 start = i;
             }
@@ -226,6 +229,8 @@ public final class CanonicalIterator {
     private static boolean SKIP_ZEROS = true;
 
     // fields
+    private final Normalizer2 nfd;
+    private final Normalizer2Impl nfcImpl;
     private String source;
     private boolean done;
     private String[][] pieces;
@@ -286,37 +291,30 @@ public final class CanonicalIterator {
 
         result.add(segment);
         StringBuffer workingBuffer = new StringBuffer();
+        UnicodeSet starts = new UnicodeSet();
 
         // cycle through all the characters
-        int cp=0;
-        int[] range = new int[2];
-        for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) {
+        int cp;
+        for (int i = 0; i < segment.length(); i += Character.charCount(cp)) {
 
             // see if any character is at the start of some decomposition
-            cp = UTF16.charAt(segment, i);
-            USerializedSet starts = new USerializedSet();
-
-            if (!NormalizerImpl.getCanonStartSet(cp, starts)) {
+            cp = segment.codePointAt(i);
+            if (!nfcImpl.getCanonStartSet(cp, starts)) {
                 continue;
             }
-            int j=0;
             // if so, see which decompositions match
-            int rangeCount = starts.countRanges();
-            for(j = 0; j < rangeCount; ++j) {
-                starts.getRange(j, range);
-                int end=range[1];
-                for (int cp2 = range[0]; cp2 <= end; ++cp2) {
-                    Set<String> remainder = extract(cp2, segment, i, workingBuffer);
-                    if (remainder == null) {
-                        continue;
-                    }
+            for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) {
+                int cp2 = iter.codepoint;
+                Set<String> remainder = extract(cp2, segment, i, workingBuffer);
+                if (remainder == null) {
+                    continue;
+                }
 
-                    // there were some matches, so add all the possibilities to the set.
-                    String prefix= segment.substring(0,i);
-                    prefix += UTF16.valueOf(cp2);
-                    for (String item : remainder) {
-                        result.add(prefix + item);
-                    }
+                // there were some matches, so add all the possibilities to the set.
+                String prefix= segment.substring(0,i);
+                prefix += UTF16.valueOf(cp2);
+                for (String item : remainder) {
+                    result.add(prefix + item);
+                }
             }
         }
@@ -368,8 +366,10 @@ public final class CanonicalIterator {
         if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp))
             + ", " + Utility.hex(segment.substring(segmentPos)));
 
-        //String decomp = Normalizer.normalize(UTF16.valueOf(comp), Normalizer.DECOMP, 0);
-        String decomp = Normalizer.normalize(comp, Normalizer.NFD);
+        String decomp = nfcImpl.getDecomposition(comp);
+        if (decomp == null) {
+            decomp = UTF16.valueOf(comp);
+        }
 
         // See if it matches the start of segment (at segmentPos)
         boolean ok = false;
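getDecomposition(c) returns the canonical mapping or null for a character that has none, hence the UTF16.valueOf fallback above. Illustrative values:

```java
String d1 = nfcImpl.getDecomposition(0x00E9);   // "e\u0301" for é
String d2 = nfcImpl.getDecomposition(0x0041);   // null: 'A' does not decompose
```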
@@ -6,11 +6,9 @@
  */
 package com.ibm.icu.text;
 import com.ibm.icu.impl.Normalizer2Impl;
-import com.ibm.icu.impl.NormalizerImpl;
 import com.ibm.icu.impl.Norm2AllModes;
 import com.ibm.icu.impl.UCaseProps;
 import com.ibm.icu.lang.UCharacter;
-import com.ibm.icu.util.VersionInfo;
 
 import java.io.IOException;
 import java.nio.CharBuffer;
@@ -2403,14 +2401,6 @@ public final class Normalizer implements Cloneable {
         }
     }
 
-    /**
-     * Fetches the Unicode version burned into the Normalization data file
-     * @return VersionInfo version information of the normalizer
-     */
-    static VersionInfo getUnicodeVersion() {
-        return NormalizerImpl.getUnicodeVersion();
-    }
-
     /**
      * An Appendable that writes into a char array with a capacity that may be
      * less than array.length.
@@ -17,6 +17,7 @@ import com.ibm.icu.impl.ICUData;
 import com.ibm.icu.impl.ICUResourceBundle;
 import com.ibm.icu.impl.StringPrepDataReader;
 import com.ibm.icu.impl.UBiDiProps;
+import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterDirection;
 import com.ibm.icu.util.VersionInfo;
 
@@ -297,7 +298,7 @@ public final class StringPrep {
         checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
         sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
         normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
-        VersionInfo normUniVer = Normalizer.getUnicodeVersion();
+        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
@@ -17,7 +17,6 @@ import java.util.TreeSet;
 
 import com.ibm.icu.impl.BMPSet;
 import com.ibm.icu.impl.Norm2AllModes;
-import com.ibm.icu.impl.NormalizerImpl;
 import com.ibm.icu.impl.RuleCharacterIterator;
 import com.ibm.icu.impl.SortedSetRelation;
 import com.ibm.icu.impl.UBiDiProps;
@@ -3092,11 +3091,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
             UCharacterProperty.INSTANCE.addPropertyStarts(incl);
             UCharacterProperty.INSTANCE.upropsvec_addPropertyStarts(incl);
             break;
-        case UCharacterProperty.SRC_NORM:
-            NormalizerImpl.addPropertyStarts(incl);
-            break;
         case UCharacterProperty.SRC_CASE_AND_NORM:
-            NormalizerImpl.addPropertyStarts(incl);
+            Norm2AllModes.getNFCInstanceNoIOException().impl.addPropertyStarts(incl);
             UCaseProps.getSingleton().addPropertyStarts(incl);
             break;
         case UCharacterProperty.SRC_NFC:
@@ -14,12 +14,12 @@ import java.util.Locale;
 
 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.dev.test.TestUtil;
-import com.ibm.icu.impl.NormalizerImpl;
+import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.Normalizer2Impl;
 import com.ibm.icu.impl.UBiDiProps;
 import com.ibm.icu.impl.UCaseProps;
 import com.ibm.icu.impl.UCharacterName;
 import com.ibm.icu.impl.UCharacterProperty;
-import com.ibm.icu.impl.USerializedSet;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
@@ -2204,19 +2204,6 @@ public final class UCharacterTest extends TestFmwk
         }
     }
 
-    /* add characters from a serialized set to a normal one */
-    private static void _setAddSerialized(UnicodeSet set, USerializedSet sset) {
-        // int start, end;
-        int i, count;
-
-        count=sset.countRanges();
-        int[] range = new int[2];
-        for(i=0; i<count; ++i) {
-            sset.getRange(i,range);
-            set.add(range[0],range[1]);
-        }
-    }
-
     private boolean showADiffB(UnicodeSet a, UnicodeSet b,
                                String a_name, String b_name,
                                boolean expect,
@@ -2284,7 +2271,6 @@ public final class UCharacterTest extends TestFmwk
     public void TestConsistency() throws IOException {
         UnicodeSet set1, set2, set3, set4;
 
-        USerializedSet sset;
         int start, end;
         int i, length;
 
@@ -2368,10 +2354,9 @@ public final class UCharacterTest extends TestFmwk
          */
         Normalizer2 norm2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
         set1=new UnicodeSet();
+        Norm2AllModes.getNFCInstanceNoIOException().impl.
+            ensureCanonIterData().getCanonStartSet(0x49, set1);
         set2=new UnicodeSet();
-        sset = new USerializedSet();
-        NormalizerImpl.getCanonStartSet(0x49,sset);
-        _setAddSerialized(set1, sset);
 
         /* enumerate all characters that are plausible to be latin letters */
         for(start=0xa0; start<0x2000; ++start) {
@@ -2869,7 +2854,7 @@ public final class UCharacterTest extends TestFmwk
         }
 
         // Testing when "if(ch<NormalizerImpl.JAMO_L_BASE)" is true
-        for(int i=NormalizerImpl.JAMO_L_BASE-5; i<NormalizerImpl.JAMO_L_BASE; i++){
+        for(int i=Normalizer2Impl.Hangul.JAMO_L_BASE-5; i<Normalizer2Impl.Hangul.JAMO_L_BASE; i++){
             if(UCharacter.getIntPropertyValue(i, UProperty.HANGUL_SYLLABLE_TYPE) != 0){
                 errln("UCharacter.getIntPropertyValue(ch, type) was suppose to return 0 " +
                     "when passing ch: " + i + "and type of Property.HANGUL_SYLLABLE_TYPE");
@@ -2878,7 +2863,7 @@ public final class UCharacterTest extends TestFmwk
         }
 
         // Testing when "else if((ch-=NormalizerImpl.HANGUL_BASE)<0)" is true
-        for(int i=NormalizerImpl.HANGUL_BASE-5; i<NormalizerImpl.HANGUL_BASE; i++){
+        for(int i=Normalizer2Impl.Hangul.HANGUL_BASE-5; i<Normalizer2Impl.Hangul.HANGUL_BASE; i++){
             if(UCharacter.getIntPropertyValue(i, UProperty.HANGUL_SYLLABLE_TYPE) != 0){
                 errln("UCharacter.getIntPropertyValue(ch, type) was suppose to return 0 " +
                     "when passing ch: " + i + "and type of Property.HANGUL_SYLLABLE_TYPE");
@@ -11,12 +11,11 @@ import java.text.StringCharacterIterator;
 import java.util.Random;
 
 import com.ibm.icu.dev.test.TestFmwk;
-import com.ibm.icu.impl.NormalizerImpl;
+import com.ibm.icu.impl.Norm2AllModes;
+import com.ibm.icu.impl.Normalizer2Impl;
 import com.ibm.icu.impl.USerializedSet;
 import com.ibm.icu.impl.Utility;
-import com.ibm.icu.lang.*;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UCharacterCategory;
 import com.ibm.icu.text.Normalizer;
 import com.ibm.icu.text.UCharacterIterator;
 import com.ibm.icu.text.UTF16;
@@ -1998,21 +1997,17 @@ public class BasicTest extends TestFmwk {
 
         // test cases with i and I to make sure Turkic works
         char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
-        USerializedSet sset=new USerializedSet();
-        UnicodeSet set = new UnicodeSet();
+        UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();
+        Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
+        nfcImpl.ensureCanonIterData();
 
         String s1, s2;
         int start, end;
 
         // collect all sets into one for contiguous output
-        int[] startEnd = new int[2];
         for(i=0; i<iI.length; ++i) {
-            if(NormalizerImpl.getCanonStartSet(iI[i], sset)) {
-                count=sset.countRanges();
-                for(j=0; j<count; ++j) {
-                    sset.getRange(j, startEnd);
-                    set.add(startEnd[0], startEnd[1]);
-                }
+            if(nfcImpl.getCanonStartSet(iI[i], iSet)) {
+                set.addAll(iSet);
             }
         }
@@ -2771,20 +2766,24 @@ public class BasicTest extends TestFmwk {
         USerializedSet sset=new USerializedSet();
         UnicodeSet set = new UnicodeSet();
         int start, end;
 
+        char[] serialized = {
+            0x8007,  // length
+            3,  // bmpLength
+            0xc0, 0xfe, 0xfffc,
+            1, 9, 0x10, 0xfffc
+        };
+        sset.getSet(serialized, 0);
+
         // collect all sets into one for contiguous output
         int[] startEnd = new int[2];
 
-        if(NormalizerImpl.getCanonStartSet(0x0130, sset)) {
-            int count=sset.countRanges();
-            for(int j=0; j<count; ++j) {
-                sset.getRange(j, startEnd);
-                set.add(startEnd[0], startEnd[1]);
-            }
-        }
+        int count=sset.countRanges();
+        for(int j=0; j<count; ++j) {
+            sset.getRange(j, startEnd);
+            set.add(startEnd[0], startEnd[1]);
+        }
 
-        // test all of these precomposed characters
+        // test all of these characters
         UnicodeSetIterator it = new UnicodeSetIterator(set);
         while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
             start=it.codepoint;
@@ -2793,10 +2792,11 @@ public class BasicTest extends TestFmwk {
                 if(!sset.contains(start)){
                     errln("USerializedSet.contains failed for "+Utility.hex(start,8));
                 }
+                ++start;
             }
         }
     }
 
 
     public void TestReturnFailure(){
         char[] term = {'r','\u00e9','s','u','m','\u00e9' };
         char[] decomposed_term = new char[10 + term.length + 2];
@@ -12,9 +12,6 @@ import java.util.SortedSet;
 import java.util.TreeSet;
 import java.util.Set;
 import com.ibm.icu.dev.test.TestFmwk;
-import com.ibm.icu.impl.Norm2AllModes;
-import com.ibm.icu.impl.Normalizer2Impl;
-import com.ibm.icu.impl.NormalizerImpl;
 import com.ibm.icu.impl.Utility;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.CanonicalIterator;
@@ -41,17 +38,6 @@ public class TestCanonicalIterator extends TestFmwk {
         {"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
     };
 
-    public void TestOldAndNew() {
-        Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
-        nfcImpl.ensureCanonIterData();
-        for (int c = 0; c <= 0x10ffff; ++c) {
-            if (nfcImpl.isCanonSegmentStarter(c) != NormalizerImpl.isCanonSafeStart(c)) {
-                errln(String.format("old!=new segment starter for U+%04x: old %b new %b",
-                    c, NormalizerImpl.isCanonSafeStart(c), nfcImpl.isCanonSegmentStarter(c)));
-            }
-        }
-    }
-
     public void TestExhaustive() {
         int counter = 0;
         CanonicalIterator it = new CanonicalIterator("");