mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
ICU-7273 build data for CanonicalIterator on the fly; test old vs. new isCanonSegmentStarter()
X-SVN-Rev: 27552
This commit is contained in:
parent
784704f4c7
commit
3ea9a0e230
2 changed files with 138 additions and 5 deletions
|
@ -10,6 +10,7 @@ import java.io.BufferedInputStream;
|
|||
import java.io.DataInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
@ -467,7 +468,7 @@ public final class Normalizer2Impl {
|
|||
Trie2Writable newFCDTrie=new Trie2Writable(0, 0);
|
||||
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
|
||||
while(trieIterator.hasNext()) {
|
||||
// Set the FCD value for a range of same-norm16 charcters.
|
||||
// Set the FCD value for a range of same-norm16 characters.
|
||||
Trie2.Range range=trieIterator.next();
|
||||
if(range.value!=0) {
|
||||
setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie);
|
||||
|
@ -495,6 +496,89 @@ public final class Normalizer2Impl {
|
|||
return fcdTrie=newFCDTrie.toTrie2_16();
|
||||
}
|
||||
|
||||
public synchronized Normalizer2Impl ensureCanonIterData() {
|
||||
if(canonIterData==null) {
|
||||
Trie2Writable newData=new Trie2Writable(0, 0);
|
||||
canonStartSets=new ArrayList<UnicodeSet>();
|
||||
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
|
||||
while(trieIterator.hasNext()) {
|
||||
Trie2.Range range=trieIterator.next();
|
||||
int norm16=range.value;
|
||||
if(norm16==0) {
|
||||
continue; // inert
|
||||
}
|
||||
if(norm16==minYesNo) {
|
||||
// Hangul LV & LVT: Set has-compositions for all syllables
|
||||
// to minimize the trie size, although only LV syllables
|
||||
// do have compositions. Handle at runtime.
|
||||
// Set the same value for the whole range because
|
||||
// there cannot be other data. Hangul syllables are segment starters,
|
||||
// and since they decompose they cannot have canonStartSets.
|
||||
// (There is no decomposable character in a decomposition mapping.)
|
||||
range.value=CANON_HAS_COMPOSITIONS;
|
||||
newData.setRange(range, true);
|
||||
continue;
|
||||
}
|
||||
for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
|
||||
int oldValue=newData.get(c);
|
||||
int newValue=oldValue;
|
||||
if(norm16>=minMaybeYes) {
|
||||
// not a segment starter if it occurs in a decomposition or has cc!=0
|
||||
newValue|=CANON_NOT_SEGMENT_STARTER;
|
||||
if(norm16<MIN_NORMAL_MAYBE_YES) {
|
||||
newValue|=CANON_HAS_COMPOSITIONS;
|
||||
}
|
||||
} else if(norm16<minYesNo) {
|
||||
newValue|=CANON_HAS_COMPOSITIONS;
|
||||
} else {
|
||||
// c has a decomposition
|
||||
int c2=c;
|
||||
while(limitNoNo<=norm16 && norm16<minMaybeYes) {
|
||||
c2=this.mapAlgorithmic(c2, norm16);
|
||||
norm16=getNorm16(c2);
|
||||
}
|
||||
if(minYesNo<=norm16 && norm16<limitNoNo) {
|
||||
// c decomposes, get everything from the variable-length extra data
|
||||
int firstUnit=extraData.charAt(norm16++);
|
||||
if(c==c2 && (firstUnit&MAPPING_PLUS_COMPOSITION_LIST)!=0) {
|
||||
newValue|=CANON_HAS_COMPOSITIONS; // original c has compositions
|
||||
}
|
||||
int length=firstUnit&MAPPING_LENGTH_MASK;
|
||||
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
|
||||
if(c==c2 && (extraData.charAt(norm16)&0xff)!=0) {
|
||||
newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
|
||||
}
|
||||
++norm16;
|
||||
}
|
||||
if(length!=0) {
|
||||
// add c to first code point's start set
|
||||
int limit=norm16+length;
|
||||
c2=extraData.codePointAt(norm16);
|
||||
addToStartSet(newData, c, c2);
|
||||
// set CANON_NOT_SEGMENT_STARTER for each remaining code point
|
||||
while((norm16+=Character.charCount(c2))<limit) {
|
||||
c2=extraData.codePointAt(norm16);
|
||||
int c2Value=newData.get(c2);
|
||||
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
|
||||
newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// c decomposed to c2 algorithmically; c has cc==0
|
||||
addToStartSet(newData, c, c2);
|
||||
}
|
||||
}
|
||||
if(newValue!=oldValue) {
|
||||
newData.set(c, newValue);
|
||||
}
|
||||
}
|
||||
}
|
||||
canonIterData=newData.toTrie2_32();
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
 * Looks up the value stored in the normalization trie for a code point.
 *
 * @param c the code point to look up
 * @return the trie value (norm16) for c
 */
public int getNorm16(int c) {
    final int norm16=normTrie.get(c);
    return norm16;
}
|
||||
|
||||
public int getCompQuickCheck(int norm16) {
|
||||
|
@ -605,6 +689,10 @@ public final class Normalizer2Impl {
|
|||
}
|
||||
}
|
||||
|
||||
public boolean isCanonSegmentStarter(int c) {
|
||||
return canonIterData.get(c)>=0;
|
||||
}
|
||||
|
||||
public static final int MIN_CCC_LCCC_CP=0x300;
|
||||
|
||||
public static final int MIN_YES_YES_WITH_CC=0xff01;
|
||||
|
@ -1782,6 +1870,30 @@ public final class Normalizer2Impl {
|
|||
return p;
|
||||
}
|
||||
|
||||
private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
|
||||
int canonValue=newData.get(decompLead);
|
||||
if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
|
||||
// origin is the first character whose decomposition starts with
|
||||
// the character for which we are setting the value.
|
||||
newData.set(decompLead, canonValue|origin);
|
||||
} else {
|
||||
// origin is not the first character, or it is U+0000.
|
||||
UnicodeSet set;
|
||||
if((canonValue&CANON_HAS_SET)==0) {
|
||||
int firstOrigin=canonValue&CANON_VALUE_MASK;
|
||||
canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
|
||||
newData.set(decompLead, canonValue);
|
||||
canonStartSets.add(set=new UnicodeSet());
|
||||
if(firstOrigin!=0) {
|
||||
set.add(firstOrigin);
|
||||
}
|
||||
} else {
|
||||
set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
|
||||
}
|
||||
set.add(origin);
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unused")
|
||||
private VersionInfo dataVersion;
|
||||
|
||||
|
@ -1800,4 +1912,12 @@ public final class Normalizer2Impl {
|
|||
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
|
||||
|
||||
private Trie2_16 fcdTrie;
|
||||
// Per-code-point data for canonical iteration (CANON_* flag bits plus a low-bits value).
// Stays null until ensureCanonIterData() has built it.
private Trie2_32 canonIterData;
|
||||
// Sets of characters whose decompositions start with a given code point.
// A canonIterData value with CANON_HAS_SET holds an index into this list in its low bits.
// Allocated by ensureCanonIterData().
private ArrayList<UnicodeSet> canonStartSets;
|
||||
|
||||
// bits in canonIterData
|
||||
// Sign bit of a canonIterData value: set when the code point is not a segment starter.
// Being the sign bit, it makes the stored int negative.
private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
|
||||
// Set when the code point has compositions. Also set for the whole Hangul LV & LVT range,
// although only LV syllables actually have compositions.
private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
|
||||
// Set when the low bits (CANON_VALUE_MASK) are an index into canonStartSets
// rather than a single origin code point.
private static final int CANON_HAS_SET = 0x200000;
|
||||
// Low 21 bits of a canonIterData value: either one origin code point
// (0 means none recorded) or, with CANON_HAS_SET, an index into canonStartSets.
private static final int CANON_VALUE_MASK = 0x1fffff;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.normalizer;
|
||||
|
@ -12,6 +12,9 @@ import java.util.SortedSet;
|
|||
import java.util.TreeSet;
|
||||
import java.util.Set;
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.Norm2AllModes;
|
||||
import com.ibm.icu.impl.Normalizer2Impl;
|
||||
import com.ibm.icu.impl.NormalizerImpl;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
|
@ -37,8 +40,18 @@ public class TestCanonicalIterator extends TestFmwk {
|
|||
{"\u010d\u017E", "c\u030Cz\u030C, c\u030C\u017E, \u010Dz\u030C, \u010D\u017E"},
|
||||
{"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
|
||||
};
|
||||
|
||||
|
||||
|
||||
public void TestOldAndNew() {
|
||||
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
|
||||
nfcImpl.ensureCanonIterData();
|
||||
for (int c = 0; c <= 0x10ffff; ++c) {
|
||||
if (nfcImpl.isCanonSegmentStarter(c) != NormalizerImpl.isCanonSafeStart(c)) {
|
||||
errln(String.format("old!=new segment starter for U+%04x: old %b new %b",
|
||||
c, NormalizerImpl.isCanonSafeStart(c), nfcImpl.isCanonSegmentStarter(c)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestExhaustive() {
|
||||
int counter = 0;
|
||||
CanonicalIterator it = new CanonicalIterator("");
|
||||
|
|
Loading…
Add table
Reference in a new issue