ICU-7273 build data for CanonicalIterator on the fly; test old vs. new isCanonSegmentStarter()

X-SVN-Rev: 27552
This commit is contained in:
Markus Scherer 2010-02-12 06:31:24 +00:00
parent 784704f4c7
commit 3ea9a0e230
2 changed files with 138 additions and 5 deletions

View file

@ -10,6 +10,7 @@ import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import com.ibm.icu.text.UnicodeSet;
@ -467,7 +468,7 @@ public final class Normalizer2Impl {
Trie2Writable newFCDTrie=new Trie2Writable(0, 0);
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
while(trieIterator.hasNext()) {
// Set the FCD value for a range of same-norm16 charcters.
// Set the FCD value for a range of same-norm16 characters.
Trie2.Range range=trieIterator.next();
if(range.value!=0) {
setFCD16FromNorm16(range.startCodePoint, range.endCodePoint, range.value, newFCDTrie);
@ -495,6 +496,89 @@ public final class Normalizer2Impl {
return fcdTrie=newFCDTrie.toTrie2_16();
}
public synchronized Normalizer2Impl ensureCanonIterData() {
if(canonIterData==null) {
Trie2Writable newData=new Trie2Writable(0, 0);
canonStartSets=new ArrayList<UnicodeSet>();
Iterator<Trie2.Range> trieIterator=normTrie.iterator();
while(trieIterator.hasNext()) {
Trie2.Range range=trieIterator.next();
int norm16=range.value;
if(norm16==0) {
continue; // inert
}
if(norm16==minYesNo) {
// Hangul LV & LVT: Set has-compositions for all syllables
// to minimize the trie size, although only LV syllables
// do have compositions. Handle at runtime.
// Set the same value for the whole range because
// there cannot be other data. Hangul syllables are segment starters,
// and since they decompose they cannot have canonStartSets.
// (There is no decomposable character in a decomposition mapping.)
range.value=CANON_HAS_COMPOSITIONS;
newData.setRange(range, true);
continue;
}
for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
int oldValue=newData.get(c);
int newValue=oldValue;
if(norm16>=minMaybeYes) {
// not a segment starter if it occurs in a decomposition or has cc!=0
newValue|=CANON_NOT_SEGMENT_STARTER;
if(norm16<MIN_NORMAL_MAYBE_YES) {
newValue|=CANON_HAS_COMPOSITIONS;
}
} else if(norm16<minYesNo) {
newValue|=CANON_HAS_COMPOSITIONS;
} else {
// c has a decomposition
int c2=c;
while(limitNoNo<=norm16 && norm16<minMaybeYes) {
c2=this.mapAlgorithmic(c2, norm16);
norm16=getNorm16(c2);
}
if(minYesNo<=norm16 && norm16<limitNoNo) {
// c decomposes, get everything from the variable-length extra data
int firstUnit=extraData.charAt(norm16++);
if(c==c2 && (firstUnit&MAPPING_PLUS_COMPOSITION_LIST)!=0) {
newValue|=CANON_HAS_COMPOSITIONS; // original c has compositions
}
int length=firstUnit&MAPPING_LENGTH_MASK;
if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
if(c==c2 && (extraData.charAt(norm16)&0xff)!=0) {
newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0
}
++norm16;
}
if(length!=0) {
// add c to first code point's start set
int limit=norm16+length;
c2=extraData.codePointAt(norm16);
addToStartSet(newData, c, c2);
// set CANON_NOT_SEGMENT_STARTER for each remaining code point
while((norm16+=Character.charCount(c2))<limit) {
c2=extraData.codePointAt(norm16);
int c2Value=newData.get(c2);
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
}
}
}
} else {
// c decomposed to c2 algorithmically; c has cc==0
addToStartSet(newData, c, c2);
}
}
if(newValue!=oldValue) {
newData.set(c, newValue);
}
}
}
canonIterData=newData.toTrie2_32();
}
return this;
}
public int getNorm16(int c) { return normTrie.get(c); }
public int getCompQuickCheck(int norm16) {
@ -605,6 +689,10 @@ public final class Normalizer2Impl {
}
}
public boolean isCanonSegmentStarter(int c) {
return canonIterData.get(c)>=0;
}
public static final int MIN_CCC_LCCC_CP=0x300;
public static final int MIN_YES_YES_WITH_CC=0xff01;
@ -1782,6 +1870,30 @@ public final class Normalizer2Impl {
return p;
}
private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
int canonValue=newData.get(decompLead);
if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
// origin is the first character whose decomposition starts with
// the character for which we are setting the value.
newData.set(decompLead, canonValue|origin);
} else {
// origin is not the first character, or it is U+0000.
UnicodeSet set;
if((canonValue&CANON_HAS_SET)==0) {
int firstOrigin=canonValue&CANON_VALUE_MASK;
canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
newData.set(decompLead, canonValue);
canonStartSets.add(set=new UnicodeSet());
if(firstOrigin!=0) {
set.add(firstOrigin);
}
} else {
set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
}
set.add(origin);
}
}
@SuppressWarnings("unused")
private VersionInfo dataVersion;
@ -1800,4 +1912,12 @@ public final class Normalizer2Impl {
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
private Trie2_16 fcdTrie;
private Trie2_32 canonIterData;
private ArrayList<UnicodeSet> canonStartSets;
// bits in canonIterData
private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
private static final int CANON_HAS_SET = 0x200000;
private static final int CANON_VALUE_MASK = 0x1fffff;
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
* Copyright (C) 1996-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
* Copyright (C) 1996-2010, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.dev.test.normalizer;
@ -12,6 +12,9 @@ import java.util.SortedSet;
import java.util.TreeSet;
import java.util.Set;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.Norm2AllModes;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.impl.NormalizerImpl;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.CanonicalIterator;
@ -37,8 +40,18 @@ public class TestCanonicalIterator extends TestFmwk {
{"\u010d\u017E", "c\u030Cz\u030C, c\u030C\u017E, \u010Dz\u030C, \u010D\u017E"},
{"x\u0307\u0327", "x\u0307\u0327, x\u0327\u0307, \u1E8B\u0327"},
};
public void TestOldAndNew() {
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstanceNoIOException().impl;
nfcImpl.ensureCanonIterData();
for (int c = 0; c <= 0x10ffff; ++c) {
if (nfcImpl.isCanonSegmentStarter(c) != NormalizerImpl.isCanonSafeStart(c)) {
errln(String.format("old!=new segment starter for U+%04x: old %b new %b",
c, NormalizerImpl.isCanonSafeStart(c), nfcImpl.isCanonSegmentStarter(c)));
}
}
}
public void TestExhaustive() {
int counter = 0;
CanonicalIterator it = new CanonicalIterator("");