ICU-8246 add Normalizer2.getNFCInstance(), getNFKDInstance(), ...

X-SVN-Rev: 30996
This commit is contained in:
Markus Scherer 2011-12-01 01:50:37 +00:00
parent c50c6a20d7
commit 81e9b13f7b
8 changed files with 121 additions and 19 deletions

View file

@ -105,6 +105,66 @@ public abstract class Normalizer2 {
COMPOSE_CONTIGUOUS
};
/**
 * Convenience accessor for the Unicode NFC normalizer.
 * Equivalent to calling getInstance(null, "nfc", Mode.COMPOSE).
 * The object handed back is an unmodifiable singleton.
 * @return the requested Normalizer2, if successful
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
public static Normalizer2 getNFCInstance() {
    // The NFC data set exposes both directions; pick the composing one.
    final Norm2AllModes nfcModes = Norm2AllModes.getNFCInstance();
    return nfcModes.comp;
}
/**
 * Convenience accessor for the Unicode NFD normalizer.
 * Equivalent to calling getInstance(null, "nfc", Mode.DECOMPOSE).
 * The object handed back is an unmodifiable singleton.
 * @return the requested Normalizer2, if successful
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
public static Normalizer2 getNFDInstance() {
    // NFD is served from the NFC data set, using its decomposing normalizer.
    final Norm2AllModes nfcModes = Norm2AllModes.getNFCInstance();
    return nfcModes.decomp;
}
/**
 * Convenience accessor for the Unicode NFKC normalizer.
 * Equivalent to calling getInstance(null, "nfkc", Mode.COMPOSE).
 * The object handed back is an unmodifiable singleton.
 * @return the requested Normalizer2, if successful
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
public static Normalizer2 getNFKCInstance() {
    // The NFKC data set exposes both directions; pick the composing one.
    final Norm2AllModes nfkcModes = Norm2AllModes.getNFKCInstance();
    return nfkcModes.comp;
}
/**
 * Convenience accessor for the Unicode NFKD normalizer.
 * Equivalent to calling getInstance(null, "nfkc", Mode.DECOMPOSE).
 * The object handed back is an unmodifiable singleton.
 * @return the requested Normalizer2, if successful
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
public static Normalizer2 getNFKDInstance() {
    // NFKD is served from the NFKC data set, using its decomposing normalizer.
    final Norm2AllModes nfkcModes = Norm2AllModes.getNFKCInstance();
    return nfkcModes.decomp;
}
/**
 * Convenience accessor for the Unicode NFKC_Casefold normalizer.
 * Equivalent to calling getInstance(null, "nfkc_cf", Mode.COMPOSE).
 * The object handed back is an unmodifiable singleton.
 * @return the requested Normalizer2, if successful
 * @draft ICU 49
 * @provisional This API might change or be removed in a future release.
 */
public static Normalizer2 getNFKCCasefoldInstance() {
    // Use the composing normalizer of the NFKC_Casefold data set.
    final Norm2AllModes nfkcCasefoldModes = Norm2AllModes.getNFKC_CFInstance();
    return nfkcCasefoldModes.comp;
}
/**
* Returns a Normalizer2 instance which uses the specified data file
* (an ICU data file if data=null, or else custom binary data)

View file

@ -10,7 +10,6 @@ import java.util.HashSet;
import java.util.Set;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.text.Normalizer2.Mode;
/**
* Simple internal utility class for helping with getSource/TargetSet
@ -20,7 +19,7 @@ class SourceTargetUtility {
final UnicodeSet sourceCache;
final Set<String> sourceStrings;
static final UnicodeSet NON_STARTERS = new UnicodeSet("[:^ccc=0:]").freeze();
static Normalizer2 NFC = Normalizer2.getInstance(null, "nfc", Mode.COMPOSE);
static Normalizer2 NFC = Normalizer2.getNFCInstance();
//static final UnicodeSet TRAILING_COMBINING = new UnicodeSet();
public SourceTargetUtility(Transform<String, String> transform) {

View file

@ -30,7 +30,6 @@ import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
import com.ibm.icu.text.AlphabeticIndex.Record;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Normalizer2.Mode;
import com.ibm.icu.text.RawCollationKey;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.UnicodeSet;
@ -662,7 +661,7 @@ public class AlphabeticIndexTest extends TestFmwk {
ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true);
extras.addAll(expansions).removeAll(TO_TRY);
if (extras.size() != 0) {
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Mode.COMPOSE);
Normalizer2 normalizer = Normalizer2.getNFKCInstance();
for (String current : extras) {
if (!TO_TRY.containsAll(current))
continue;

View file

@ -678,8 +678,8 @@ public final class UCharacterTest extends TestFmwk
final String DIR =
"L R EN ES ET AN CS B S WS ON LRE LRO AL RLE RLO PDF NSM BN ";
Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
Normalizer2 nfkc = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
Normalizer2 nfc = Normalizer2.getNFCInstance();
Normalizer2 nfkc = Normalizer2.getNFKCInstance();
try
{
@ -2492,7 +2492,7 @@ public final class UCharacterTest extends TestFmwk
* In general, the set for the middle such character should be a subset
* of the set for the first.
*/
Normalizer2 norm2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
Normalizer2 norm2=Normalizer2.getNFDInstance();
set1=new UnicodeSet();
Norm2AllModes.getNFCInstance().impl.
ensureCanonIterData().getCanonStartSet(0x49, set1);

View file

@ -2015,7 +2015,7 @@ public class BasicTest extends TestFmwk {
}
// test all of these precomposed characters
Normalizer2 nfcNorm2 = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance();
UnicodeSetIterator it = new UnicodeSetIterator(set);
int c;
while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) {
@ -2322,7 +2322,7 @@ public class BasicTest extends TestFmwk {
// For each character about which we are unsure, see if it changes when we add
// one of the back-combining characters.
Normalizer2 norm2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
Normalizer2 norm2=Normalizer2.getNFCInstance();
StringBuilder s=new StringBuilder();
iter.reset(unsure);
while(iter.next()) {
@ -2526,7 +2526,7 @@ public class BasicTest extends TestFmwk {
}
public void TestGetRawDecomposition() {
Normalizer2 n2=Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
Normalizer2 n2=Normalizer2.getNFKCInstance();
/*
* Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values,
* without recursive decomposition.
@ -2637,7 +2637,7 @@ public class BasicTest extends TestFmwk {
}
public void TestFilteredNormalizer2() {
Normalizer2 nfcNorm2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
Normalizer2 nfcNorm2=Normalizer2.getNFCInstance();
UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]");
FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
int c;
@ -2650,4 +2650,45 @@ public class BasicTest extends TestFmwk {
expectedCC, cc);
}
}
/**
 * Checks that each of the no-argument Normalizer2 factory methods returns a
 * normalizer of the advertised form, by normalizing one shared input string
 * and comparing against the expected result for that form.
 */
public void TestGetEasyToUseInstance() {
    // Shared input for every normalizer below:
    //   U+00A0 maps to <noBreak> 0020
    //   U+00C7 U+0301 = U+1E08 = 0043 0327 0301
    final String input = "\u00A0\u00C7\u0301";

    // NFC: no-break space is kept, the letter sequence is composed.
    final String nfcOut = Normalizer2.getNFCInstance().normalize(input);
    assertEquals(
            "getNFCInstance() did not return an NFC instance (normalizes to "
                    + prettify(nfcOut) + ')',
            "\u00A0\u1E08", nfcOut);

    // NFD: no-break space is kept, the letter sequence is decomposed.
    final String nfdOut = Normalizer2.getNFDInstance().normalize(input);
    assertEquals(
            "getNFDInstance() did not return an NFD instance (normalizes to "
                    + prettify(nfdOut) + ')',
            "\u00A0C\u0327\u0301", nfdOut);

    // NFKC: no-break space becomes a plain space, the letter sequence is composed.
    final String nfkcOut = Normalizer2.getNFKCInstance().normalize(input);
    assertEquals(
            "getNFKCInstance() did not return an NFKC instance (normalizes to "
                    + prettify(nfkcOut) + ')',
            " \u1E08", nfkcOut);

    // NFKD: no-break space becomes a plain space, the letter sequence is decomposed.
    final String nfkdOut = Normalizer2.getNFKDInstance().normalize(input);
    assertEquals(
            "getNFKDInstance() did not return an NFKD instance (normalizes to "
                    + prettify(nfkdOut) + ')',
            " C\u0327\u0301", nfkdOut);

    // NFKC_Casefold: like NFKC, but the expected composed letter is U+1E09 rather than U+1E08.
    final String nfkcCasefoldOut = Normalizer2.getNFKCCasefoldInstance().normalize(input);
    assertEquals(
            "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance (normalizes to "
                    + prettify(nfkcCasefoldOut) + ')',
            " \u1E09", nfkcCasefoldOut);
}
}

View file

@ -484,7 +484,7 @@ public class SpoofCheckerTest extends TestFmwk {
"\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
+ "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
matcher("");
Normalizer2 normalizer = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
Normalizer2 normalizer = Normalizer2.getNFDInstance();
int lineNum = 0;
String inputLine;
while ((inputLine = confusablesRdr.readLine()) != null) {

View file

@ -25,7 +25,6 @@ import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.CanonicalIterator;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.Normalizer2.Mode;
import com.ibm.icu.text.Replaceable;
import com.ibm.icu.text.ReplaceableString;
import com.ibm.icu.text.StringTransform;
@ -3016,8 +3015,8 @@ public class TransliteratorTest extends TestFmwk {
public void TestSourceTargetSet2() {
Normalizer2 nfc = Normalizer2.getInstance(null, "nfc", Mode.COMPOSE);
Normalizer2 nfd = Normalizer2.getInstance(null, "nfc", Mode.DECOMPOSE);
Normalizer2 nfc = Normalizer2.getNFCInstance();
Normalizer2 nfd = Normalizer2.getNFDInstance();
// Normalizer2 nfkd = Normalizer2.getInstance(null, "nfkd", Mode.DECOMPOSE);
// UnicodeSet nfkdSource = new UnicodeSet();

View file

@ -19,10 +19,14 @@ public class IcuUnicodeNormalizerFactory implements UnicodeTransform.Factory {
public UnicodeTransform getInstance(Type type) {
switch (type) {
case NFC: case NFKC:
return new IcuUnicodeNormalizer(Normalizer2.getInstance(null, type.toString(), Mode.COMPOSE));
case NFD: case NFKD:
return new IcuUnicodeNormalizer(Normalizer2.getInstance(null, type == Type.NFD ? "NFC" : "NFKC", Mode.DECOMPOSE));
case NFC:
return new IcuUnicodeNormalizer(Normalizer2.getNFCInstance());
case NFKC:
return new IcuUnicodeNormalizer(Normalizer2.getNFKCInstance());
case NFD:
return new IcuUnicodeNormalizer(Normalizer2.getNFDInstance());
case NFKD:
return new IcuUnicodeNormalizer(Normalizer2.getNFKDInstance());
case CASEFOLD:
return new CaseFolder();
default: