From e3546c39d6ed832bf816a5c3e89ad66588520505 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Thu, 8 Sep 2011 22:28:40 +0000 Subject: [PATCH] ICU-8807 Add internal changes for use in unicode tools. Adds internal API, but doesn't change behavior if that is not called. X-SVN-Rev: 30638 --- .gitattributes | 4 + .../com/ibm/icu/impl/ImplicitCEGenerator.java | 4 +- .../src/com/ibm/icu/impl/UnicodeRegex.java | 38 ++- .../core/src/com/ibm/icu/text/UnicodeSet.java | 35 ++- .../ibm/icu/dev/test/translit/TestAll.java | 5 +- .../test/translit/TestUnicodeProperty.java | 1 + .../icu/dev/test/util/ICUPropertyFactory.java | 4 +- .../util/IcuUnicodeNormalizerFactory.java | 1 + .../com/ibm/icu/dev/test/util/UnicodeMap.java | 4 +- .../icu/dev/test/util/UnicodeProperty.java | 223 +++++++++++++++--- .../test/util/UnicodePropertySymbolTable.java | 1 + .../icu/dev/test/util/UnicodeTransform.java | 1 + 12 files changed, 268 insertions(+), 53 deletions(-) create mode 100644 icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestUnicodeProperty.java create mode 100644 icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/IcuUnicodeNormalizerFactory.java create mode 100644 icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodePropertySymbolTable.java create mode 100644 icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeTransform.java diff --git a/.gitattributes b/.gitattributes index 3f7ff05349b..84b3e770060 100644 --- a/.gitattributes +++ b/.gitattributes @@ -714,6 +714,10 @@ icu4j/main/tests/translit/.externalToolBuilders/copy-translit-test-data.launch - icu4j/main/tests/translit/.settings/org.eclipse.core.resources.prefs -text icu4j/main/tests/translit/.settings/org.eclipse.jdt.core.prefs -text icu4j/main/tests/translit/.settings/org.eclipse.jdt.ui.prefs -text +icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestUnicodeProperty.java -text +icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/IcuUnicodeNormalizerFactory.java -text 
+icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodePropertySymbolTable.java -text +icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeTransform.java -text icu4j/main/tests/translit/translit-tests-build.launch -text icu4j/manifest.stub -text icu4j/tools/build/.settings/org.eclipse.core.resources.prefs -text diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/ImplicitCEGenerator.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/ImplicitCEGenerator.java index 41bd01e424b..63e5691ea82 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/ImplicitCEGenerator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/ImplicitCEGenerator.java @@ -1,6 +1,6 @@ /** ******************************************************************************* - * Copyright (C) 2004-2010, International Business Machines Corporation and * + * Copyright (C) 2004-2011, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -46,7 +46,7 @@ public class ImplicitCEGenerator { // 4E00;;Lo;0;L;;;;;N;;;;; // 9FCB;;Lo;0;L;;;;;N;;;;; CJK_BASE = 0x4E00, - CJK_LIMIT = 0x9FCB+1, + CJK_LIMIT = 0x9FCC+1, CJK_COMPAT_USED_BASE = 0xFA0E, CJK_COMPAT_USED_LIMIT = 0xFA2F+1, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java index 925ad5a1e4b..252be8d9815 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UnicodeRegex.java @@ -1,6 +1,6 @@ /* ******************************************************************************** - * Copyright (C) 2009-2010, Google, International Business Machines Corporation * + * Copyright (C) 2009-2011, Google, International Business Machines Corporation * * and others. All Rights Reserved. 
* ******************************************************************************** */ @@ -24,6 +24,7 @@ import java.util.TreeMap; import java.util.regex.Pattern; import com.ibm.icu.text.StringTransform; +import com.ibm.icu.text.SymbolTable; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.Freezable; @@ -37,6 +38,26 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT // Note: we don't currently have any state, but intend to in the future, // particularly for the regex style supported. + private SymbolTable symbolTable; + private ParsePosition parsePosition = new ParsePosition(0); + + /** + * Get the symbol table for internal processing + * @internal + */ + public SymbolTable getSymbolTable() { + return symbolTable; + } + + /** + * Set the symbol table for internal processing + * @internal + */ + public UnicodeRegex setSymbolTable(SymbolTable symbolTable) { + this.symbolTable = symbolTable; + return this; + } + /** * Adds full Unicode property support, with the latest version of Unicode, * to Java Regex, bringing it up to Level 1 (see @@ -185,12 +206,12 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT // brute force replacement; do twice to allow for different order // later on can optimize for (int i = 0; i < 2; ++i) { - for (Iterator it = variables.keySet().iterator(); it.hasNext();) { - String variable = it.next(); + for (String variable : variables.keySet()) { String definition = variables.get(variable); - for (Iterator it2 = variables.keySet().iterator(); it2.hasNext();) { - String variable2 = it2.next(); - if (variable.equals(variable2)) continue; + for (String variable2 : variables.keySet()) { + if (variable.equals(variable2)) { + continue; + } String definition2 = variables.get(variable2); String altered2 = definition2.replace(variable, definition); if (!altered2.equals(definition2)) { @@ -303,7 +324,7 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT private int processSet(String regex, 
int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) { try { pos.setIndex(i); - UnicodeSet x = temp.clear().applyPattern(regex, pos, null, 0); + UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0); x.complement().complement(); // hack to fix toPattern result.append(x.toPattern(false)); i = pos.getIndex() - 1; // allow for the loop increment @@ -335,8 +356,7 @@ public class UnicodeRegex implements Cloneable, Freezable, StringT String variable = null; StringBuffer definition = new StringBuffer(); int count = 0; - for (Iterator it = lines.iterator(); it.hasNext();) { - String line = it.next(); + for (String line : lines) { ++count; // remove initial bom, comments if (line.length() == 0) continue; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java index 5a9dd609c9a..6654e5cd398 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnicodeSet.java @@ -285,6 +285,8 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa * @provisional This API might change or be removed in a future release. */ public static final UnicodeSet ALL_CODE_POINTS = new UnicodeSet(0, 0x10FFFF).freeze(); + + private static XSymbolTable XSYMBOL_TABLE = null; // for overriding the function processing private static final int LOW = 0x000000; // LOW <= all valid values. ZERO for codepoints private static final int HIGH = 0x110000; // HIGH > all valid values. 10000 for code units. @@ -3282,7 +3284,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa public UnicodeSet applyPropertyAlias(String propertyAlias, String valueAlias) { return applyPropertyAlias(propertyAlias, valueAlias, null); } - + /** * Modifies this set to contain those code points which have the * given value for the given property. 
Prior contents of this @@ -3306,6 +3308,12 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa && ((XSymbolTable)symbols).applyPropertyAlias(propertyAlias, valueAlias, this)) { return this; } + + if (XSYMBOL_TABLE != null) { + if (XSYMBOL_TABLE.applyPropertyAlias(propertyAlias, valueAlias, this)) { + return this; + } + } if (valueAlias.length() > 0) { p = UCharacter.getPropertyEnum(propertyAlias); @@ -4540,5 +4548,30 @@ public class UnicodeSet extends UnicodeFilter implements Iterable, Compa */ CONDITION_COUNT } + + /** + * Get the default symbol table. Null means ordinary processing. For internal use only. + * @return + * @internal + */ + public static XSymbolTable getDefaultXSymbolTable() { + return XSYMBOL_TABLE; + } + + /** + * Set the default symbol table. Null means ordinary processing. For internal use only. Will affect all subsequent parsing + * of UnicodeSets. + *

+ * WARNING: If this function is used with a {@link UnicodeProperty}, and the + * Unassigned characters (gc=Cn) are different than in ICU, you MUST call + * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable} + * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}. + * + * @param xSymbolTable the new default symbol table. + * @internal + */ + public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) { + XSYMBOL_TABLE = xSymbolTable; + } } //eof diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestAll.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestAll.java index cb8b2c410dd..0742a162cd9 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestAll.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestAll.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 1996-2010, International Business Machines Corporation and * + * Copyright (C) 1996-2011, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ @@ -28,7 +28,8 @@ public class TestAll extends TestGroup { "TransliteratorTest", "RegexUtilitiesTest", "UnicodeMapTest", - "ThreadTest" + "ThreadTest", + "TestUnicodeProperty" }); } diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestUnicodeProperty.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestUnicodeProperty.java new file mode 100644 index 00000000000..78597117b37 --- /dev/null +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/translit/TestUnicodeProperty.java @@ -0,0 +1 @@ +/* ******************************************************************************* * Copyright (C) 2011, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.translit; import java.util.List; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.dev.test.util.ICUPropertyFactory; import com.ibm.icu.dev.test.util.UnicodeProperty; import com.ibm.icu.dev.test.util.UnicodeProperty.Factory; import com.ibm.icu.dev.test.util.UnicodePropertySymbolTable; import com.ibm.icu.text.UnicodeSet; /** * @author markdavis * */ public class TestUnicodeProperty extends TestFmwk{ public static void main(String[] args) { new TestUnicodeProperty().run(args); } static final UnicodeSet casedLetter = new UnicodeSet("[:gc=cased letter:]"); static final UnicodeSet letter = new UnicodeSet("[:gc=L:]"); public void TestBasic() { Factory factory = ICUPropertyFactory.make(); UnicodeProperty property = factory.getProperty("gc"); List values = property.getAvailableValues(); assertTrue("Values contain GC values", values.contains("Unassigned")); final UnicodeSet lu = property.getSet("Lu"); if (!assertTrue("Gc=L contains 'A'", lu.contains('A'))) { errln("Contents:\t" + lu.complement().complement().toPattern(false)); } } public void TestSymbolTable() 
{ Factory factory = ICUPropertyFactory.make(); UnicodePropertySymbolTable upst = new UnicodePropertySymbolTable(factory); UnicodeSet.setDefaultXSymbolTable(upst); try { final UnicodeSet luSet = new UnicodeSet("[:gc=L:]"); assertTrue("Gc=L contains 'A'", luSet.contains('A')); assertTrue("Gc=L contains 'Z'", luSet.contains('Z')); assertFalse("Gc=L contains 'a'", luSet.contains('1')); UnicodeSet casedLetter2 = new UnicodeSet("[:gc=cased letter:]"); assertEquals("gc=lc are equal", casedLetter, casedLetter2); } finally { // restore the world UnicodeSet.setDefaultXSymbolTable(null); } } public void TestSymbolTable2() { Factory factory = new MyUnicodePropertyFactory(); UnicodePropertySymbolTable upst = new UnicodePropertySymbolTable(factory); UnicodeSet.setDefaultXSymbolTable(upst); try { final UnicodeSet luSet = new UnicodeSet("[:gc=L:]"); assertFalse("Gc=L contains 'A'", luSet.contains('A')); if (!assertTrue("Gc=L contains 'Z'", luSet.contains('Z'))) { errln("Contents:\t" + luSet.complement().complement().toPattern(false)); } assertFalse("Gc=L contains 'a'", luSet.contains('1')); UnicodeSet casedLetter2 = new UnicodeSet("[:gc=cased letter:]"); assertNotEquals("gc=lc should not be equal", casedLetter, casedLetter2); } finally { // restore the world UnicodeSet.setDefaultXSymbolTable(null); } } /** * For testing, override to set A-M to Cn. 
*/ static class MyUnicodeGCProperty extends UnicodeProperty.SimpleProperty { UnicodeProperty icuProperty = ICUPropertyFactory.make().getProperty("Gc"); { setName(icuProperty.getName()); setType(icuProperty.getType()); } @Override protected String _getValue(int codepoint) { if (codepoint >= 'A' && codepoint <= 'M') { return "Unassigned"; } else { return icuProperty.getValue(codepoint); } } @Override protected List _getValueAliases(String valueAlias, List result) { return icuProperty.getValueAliases(valueAlias, result); } @Override public List _getNameAliases(List result) { return icuProperty.getNameAliases(); } } /** * For testing, override to set A-M to Cn. */ static class MyUnicodePropertyFactory extends ICUPropertyFactory { private MyUnicodePropertyFactory() { add(new MyUnicodeGCProperty()); } } static class MyUnicodePropertySymbolTable extends UnicodePropertySymbolTable { public MyUnicodePropertySymbolTable(Factory factory) { super(factory); } } } \ No newline at end of file diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/ICUPropertyFactory.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/ICUPropertyFactory.java index 266d49cd8b3..bb2d86f4bf4 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/ICUPropertyFactory.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/ICUPropertyFactory.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 2002-2010, International Business Machines Corporation and * + * Copyright (C) 2002-2011, International Business Machines Corporation and * * others. All Rights Reserved. 
* ******************************************************************************* */ @@ -437,7 +437,7 @@ public class ICUPropertyFactory extends UnicodeProperty.Factory { // NFKD = UProperty.STRING_LIMIT+3 ; - private ICUPropertyFactory() { + protected ICUPropertyFactory() { Collection c = getInternalAvailablePropertyAliases(new ArrayList()); Iterator it = c.iterator(); while (it.hasNext()) { diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/IcuUnicodeNormalizerFactory.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/IcuUnicodeNormalizerFactory.java new file mode 100644 index 00000000000..dc803bfe6b3 --- /dev/null +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/IcuUnicodeNormalizerFactory.java @@ -0,0 +1 @@ +/* ******************************************************************************* * Copyright (C) 2011, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.util; import com.ibm.icu.dev.test.util.UnicodeTransform.Type; import com.ibm.icu.lang.UCharacter; import com.ibm.icu.text.Normalizer2; import com.ibm.icu.text.Normalizer2.Mode; /** * @author markdavis * */ public class IcuUnicodeNormalizerFactory implements UnicodeTransform.Factory { public UnicodeTransform getInstance(Type type) { switch (type) { case NFC: case NFKC: return new IcuUnicodeNormalizer(Normalizer2.getInstance(null, type.toString(), Mode.COMPOSE)); case NFD: case NFKD: return new IcuUnicodeNormalizer(Normalizer2.getInstance(null, type == Type.NFD ? 
"NFC" : "NFKC", Mode.DECOMPOSE)); case CASEFOLD: return new CaseFolder(); default: throw new IllegalArgumentException(); } } private static class CaseFolder extends UnicodeTransform { @Override public String transform(String source) { return UCharacter.foldCase(source.toString(), true); } } private static class IcuUnicodeNormalizer extends UnicodeTransform { private Normalizer2 normalizer; private IcuUnicodeNormalizer(Normalizer2 normalizer) { this.normalizer = normalizer; } public String transform(String src) { return normalizer.normalize(src); } public boolean isTransformed(String s) { return normalizer.isNormalized(s); } } } \ No newline at end of file diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeMap.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeMap.java index cd18571702e..97238ef67b3 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeMap.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeMap.java @@ -1,6 +1,6 @@ /* ******************************************************************************* - * Copyright (C) 1996-2010, International Business Machines Corporation and * + * Copyright (C) 1996-2011, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -277,7 +277,7 @@ public final class UnicodeMap implements Cloneable, Freezable, StringTransfor throw new UnsupportedOperationException("Attempt to modify locked object"); } if (errorOnReset && values[baseIndex] != null) { - throw new IllegalArgumentException("Attempt to reset value for " + Utility.hex(codepoint) + throw new UnsupportedOperationException("Attempt to reset value for " + Utility.hex(codepoint) + " when that is disallowed. 
Old: " + values[baseIndex] + "; New: " + value); } diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeProperty.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeProperty.java index d3f9a18af77..89ddd946e26 100644 --- a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeProperty.java +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeProperty.java @@ -10,19 +10,25 @@ import java.io.PrintWriter; import java.io.StringWriter; import java.text.ParsePosition; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.regex.Pattern; +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.UnicodeLabel; +import com.ibm.icu.dev.test.util.UnicodeMap; import com.ibm.icu.dev.test.util.CollectionUtilities.InverseMatcher; import com.ibm.icu.dev.test.util.CollectionUtilities.ObjectMatcher; import com.ibm.icu.impl.Utility; import com.ibm.icu.text.SymbolTable; +import com.ibm.icu.text.Transform; import com.ibm.icu.text.UFormat; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeMatcher; @@ -31,17 +37,87 @@ import com.ibm.icu.text.UnicodeSetIterator; public abstract class UnicodeProperty extends UnicodeLabel { - public static final UnicodeSet UNASSIGNED = new UnicodeSet("[:gc=unassigned:]").freeze(); + public static final UnicodeSet NONCHARACTERS = new UnicodeSet("[:noncharactercodepoint:]").freeze(); public static final UnicodeSet PRIVATE_USE = new UnicodeSet("[:gc=privateuse:]").freeze(); public static final UnicodeSet SURROGATE = new UnicodeSet("[:gc=surrogate:]").freeze(); - public static final UnicodeSet SPECIALS = new UnicodeSet(UNASSIGNED).addAll(PRIVATE_USE).addAll(SURROGATE).freeze(); - public static final int SAMPLE_UNASSIGNED = 
UNASSIGNED.charAt(0); - public static final int SAMPLE_PRIVATE_USE = 0xE000; - public static final int SAMPLE_SURROGATE = 0xD800; - public static final UnicodeSet STUFF_TO_TEST = new UnicodeSet(SPECIALS).complement() - .add(SAMPLE_UNASSIGNED).add(SAMPLE_PRIVATE_USE).add(SAMPLE_SURROGATE).freeze(); - public static final UnicodeSet STUFF_TO_TEST_WITH_UNASSIGNED = new UnicodeSet("[:any:]").freeze(); + public static final UnicodeSet HIGH_SURROGATES = new UnicodeSet("[\\uD800-\\uDB7F]").freeze(); + public static final int SAMPLE_HIGH_SURROGATE = HIGH_SURROGATES.charAt(0); + public static final UnicodeSet HIGH_PRIVATE_USE_SURROGATES = new UnicodeSet("[\\uDB80-\\uDBFF]").freeze(); + public static final int SAMPLE_HIGH_PRIVATE_USE_SURROGATE = HIGH_PRIVATE_USE_SURROGATES.charAt(0); + public static final UnicodeSet LOW_SURROGATES = new UnicodeSet("[\\uDC00-\\uDFFF]").freeze(); + public static final int SAMPLE_LOW_SURROGATE = LOW_SURROGATES.charAt(0); + + public static final UnicodeSet PRIVATE_USE_AREA = new UnicodeSet("[\\uE000-\\uF8FF]").freeze(); + public static final int SAMPLE_PRIVATE_USE_AREA = PRIVATE_USE_AREA.charAt(0); + public static final UnicodeSet PRIVATE_USE_AREA_A = new UnicodeSet("[\\U000F0000-\\U000FFFFD]").freeze(); + public static final int SAMPLE_PRIVATE_USE_AREA_A = PRIVATE_USE_AREA_A.charAt(0); + public static final UnicodeSet PRIVATE_USE_AREA_B = new UnicodeSet("[\\U00100000-\\U0010FFFD]").freeze(); + public static final int SAMPLE_PRIVATE_USE_AREA_B = PRIVATE_USE_AREA_B.charAt(0); + + // The following are special. They are used for performance, but must be changed if the version of Unicode for the UnicodeProperty changes. 
+ private static UnicodeSet UNASSIGNED; + private static int SAMPLE_UNASSIGNED; + private static UnicodeSet SPECIALS; + private static UnicodeSet STUFF_TO_TEST; + private static UnicodeSet STUFF_TO_TEST_WITH_UNASSIGNED; + + public static synchronized UnicodeSet getUNASSIGNED() { + if (UNASSIGNED == null) { + UNASSIGNED = new UnicodeSet("[:gc=unassigned:]").freeze(); + } + return UNASSIGNED; + } + + public static synchronized int getSAMPLE_UNASSIGNED() { + if (SAMPLE_UNASSIGNED == 0) { + SAMPLE_UNASSIGNED = getUNASSIGNED().charAt(0); + } + return SAMPLE_UNASSIGNED; + } + + public static synchronized UnicodeSet getSPECIALS() { + if (SPECIALS == null) { + SPECIALS = new UnicodeSet(getUNASSIGNED()).addAll(PRIVATE_USE).addAll(SURROGATE).freeze(); + } + return SPECIALS; + } + + public static synchronized UnicodeSet getSTUFF_TO_TEST() { + if (STUFF_TO_TEST == null) { + STUFF_TO_TEST = new UnicodeSet(getSPECIALS()).complement() + .addAll(NONCHARACTERS) + .add(getSAMPLE_UNASSIGNED()) + .add(SAMPLE_HIGH_SURROGATE) + .add(SAMPLE_HIGH_PRIVATE_USE_SURROGATE) + .add(SAMPLE_LOW_SURROGATE) + .add(SAMPLE_PRIVATE_USE_AREA) + .add(SAMPLE_PRIVATE_USE_AREA_A) + .add(SAMPLE_PRIVATE_USE_AREA_B) + .freeze(); + } + return STUFF_TO_TEST; + } + + public static synchronized UnicodeSet getSTUFF_TO_TEST_WITH_UNASSIGNED() { + if (STUFF_TO_TEST_WITH_UNASSIGNED == null) { + STUFF_TO_TEST_WITH_UNASSIGNED = new UnicodeSet(getSTUFF_TO_TEST()).addAll(getUNASSIGNED()).freeze(); + } + return STUFF_TO_TEST_WITH_UNASSIGNED; + } + + /** + * Reset the cache properties. Must be done if the version of Unicode is different than the ICU one, AND any UnicodeProperty has already been instantiated. + * TODO make this a bit more robust. 
+ * @internal + */ + public static synchronized void ResetCacheProperties() { + UNASSIGNED = null; + SAMPLE_UNASSIGNED = 0; + SPECIALS = null; + STUFF_TO_TEST = null; + STUFF_TO_TEST_WITH_UNASSIGNED = null; + } public static boolean DEBUG = false; @@ -57,7 +133,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { private Map valueToFirstValueAlias = null; - private boolean hasUniformUnassigned = false; + private boolean hasUniformUnassigned = true; /* * Name: Unicode_1_Name Name: ISO_Comment Name: Name Name: Unicode_1_Name @@ -238,7 +314,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { return maxFirstValueAliasWidth; return maxValueWidth; } - + public final UnicodeSet getSet(String propertyValue) { return getSet(propertyValue, null); } @@ -247,6 +323,8 @@ public abstract class UnicodeProperty extends UnicodeLabel { return getSet(matcher, null); } + /** Adds the property value set to the result. Clear the result first if you don't want to keep the original contents. + */ public final UnicodeSet getSet(String propertyValue, UnicodeSet result) { return getSet(new SimpleMatcher(propertyValue, isType(STRING_OR_MISC_MASK) ? null : PROPERTY_COMPARATOR), @@ -257,7 +335,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { public static final String UNUSED = "??"; - public final UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { + public UnicodeSet getSet(PatternMatcher matcher, UnicodeSet result) { if (result == null) result = new UnicodeSet(); boolean uniformUnassigned = hasUniformUnassigned(); @@ -422,7 +500,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { } private static UnicodeSetIterator getStuffToTest(boolean uniformUnassigned) { - return new UnicodeSetIterator(uniformUnassigned ? STUFF_TO_TEST : STUFF_TO_TEST_WITH_UNASSIGNED); + return new UnicodeSetIterator(uniformUnassigned ? 
getSTUFF_TO_TEST() : getSTUFF_TO_TEST_WITH_UNASSIGNED()); } /** @@ -654,7 +732,9 @@ public abstract class UnicodeProperty extends UnicodeLabel { Map propertyCache = new HashMap(1); public final Factory add(UnicodeProperty sp) { - canonicalNames.put(sp.getName(), sp); + String name2 = sp.getName(); + canonicalNames.put(name2, sp); + skeletonNames.put(toSkeleton(name2), sp); List c = sp.getNameAliases(new ArrayList(1)); Iterator it = c.iterator(); while (it.hasNext()) { @@ -1178,7 +1258,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { } public static abstract class SimpleProperty extends BaseProperty { - List values; + LinkedHashSet values; public UnicodeProperty addName(String alias) { propertyAliases.add(alias); @@ -1209,7 +1289,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { } public SimpleProperty setValues(List valueAliases) { - this.values = new ArrayList(valueAliases); + this.values = new LinkedHashSet(valueAliases); for (Iterator it = this.values.iterator(); it.hasNext();) { _addToValues((String) it.next(), null); } @@ -1233,7 +1313,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { private void _addToValues(String item, String alias) { if (values == null) - values = new ArrayList(1); + values = new LinkedHashSet(); if (toValueAliases == null) _fixValueAliases(); addUnique(item, values); @@ -1328,32 +1408,57 @@ public abstract class UnicodeProperty extends UnicodeLabel { public static UnicodeSet addUntested(UnicodeSet result, boolean uniformUnassigned) { - if (!uniformUnassigned) return result; + if (uniformUnassigned && result.contains(UnicodeProperty.getSAMPLE_UNASSIGNED())) { + result.addAll(UnicodeProperty.getUNASSIGNED()); + } + + if (result.contains(UnicodeProperty.SAMPLE_HIGH_SURROGATE)) { + result.addAll(UnicodeProperty.HIGH_SURROGATES); + } + if (result.contains(UnicodeProperty.SAMPLE_HIGH_PRIVATE_USE_SURROGATE)) { + result.addAll(UnicodeProperty.HIGH_PRIVATE_USE_SURROGATES); + } + if 
(result.contains(UnicodeProperty.SAMPLE_LOW_SURROGATE)) { + result.addAll(UnicodeProperty.LOW_SURROGATES); + } + + if (result.contains(UnicodeProperty.SAMPLE_PRIVATE_USE_AREA)) { + result.addAll(UnicodeProperty.PRIVATE_USE_AREA); + } + if (result.contains(UnicodeProperty.SAMPLE_PRIVATE_USE_AREA_A)) { + result.addAll(UnicodeProperty.PRIVATE_USE_AREA_A); + } + if (result.contains(UnicodeProperty.SAMPLE_PRIVATE_USE_AREA_B)) { + result.addAll(UnicodeProperty.PRIVATE_USE_AREA_B); + } - if (result.contains(UnicodeProperty.SAMPLE_UNASSIGNED)) { - result.addAll(UnicodeProperty.UNASSIGNED); - } - if (result.contains(UnicodeProperty.SAMPLE_PRIVATE_USE)) { - result.addAll(UnicodeProperty.PRIVATE_USE); - } - if (result.contains(UnicodeProperty.SAMPLE_SURROGATE)) { - result.addAll(UnicodeProperty.SURROGATE); - } return result; } public static UnicodeMap addUntested(UnicodeMap result, boolean uniformUnassigned) { - if (!uniformUnassigned) return result; - Object temp; - if (null != (temp = result.get(UnicodeProperty.SAMPLE_UNASSIGNED))) { - result.putAll(UnicodeProperty.UNASSIGNED, temp); + if (uniformUnassigned && null != (temp = result.get(UnicodeProperty.getSAMPLE_UNASSIGNED()))) { + result.putAll(UnicodeProperty.getUNASSIGNED(), temp); } - if (null != (temp = result.get(UnicodeProperty.SAMPLE_PRIVATE_USE))) { - result.putAll(UnicodeProperty.PRIVATE_USE, temp); + + if (null != (temp = result.get(UnicodeProperty.SAMPLE_HIGH_SURROGATE))) { + result.putAll(UnicodeProperty.HIGH_SURROGATES, temp); } - if (null != (temp = result.get(UnicodeProperty.SAMPLE_SURROGATE))) { - result.putAll(UnicodeProperty.SURROGATE, temp); + if (null != (temp = result.get(UnicodeProperty.SAMPLE_HIGH_PRIVATE_USE_SURROGATE))) { + result.putAll(UnicodeProperty.HIGH_PRIVATE_USE_SURROGATES, temp); + } + if (null != (temp = result.get(UnicodeProperty.SAMPLE_LOW_SURROGATE))) { + result.putAll(UnicodeProperty.LOW_SURROGATES, temp); + } + + if (null != (temp = 
result.get(UnicodeProperty.SAMPLE_PRIVATE_USE_AREA))) { + result.putAll(UnicodeProperty.PRIVATE_USE_AREA, temp); + } + if (null != (temp = result.get(UnicodeProperty.SAMPLE_PRIVATE_USE_AREA_A))) { + result.putAll(UnicodeProperty.PRIVATE_USE_AREA_A, temp); + } + if (null != (temp = result.get(UnicodeProperty.SAMPLE_PRIVATE_USE_AREA_B))) { + result.putAll(UnicodeProperty.PRIVATE_USE_AREA_B, temp); } return result; } @@ -1363,7 +1468,7 @@ public abstract class UnicodeProperty extends UnicodeLabel { if (isType(STRING_OR_MISC_MASK)) { return equals(cp, value); } - String defaultValue = getValue(SAMPLE_UNASSIGNED); + String defaultValue = getValue(getSAMPLE_UNASSIGNED()); return defaultValue == null ? value == null : defaultValue.equals(value); } @@ -1374,5 +1479,53 @@ public abstract class UnicodeProperty extends UnicodeLabel { this.hasUniformUnassigned = hasUniformUnassigned; return this; } + + public static class UnicodeSetProperty extends BaseProperty { + protected UnicodeSet unicodeSet; + private static final String[] YESNO_ARRAY = new String[]{"Yes", "No"}; + private static final List YESNO = Arrays.asList(YESNO_ARRAY); + + public UnicodeSetProperty set(UnicodeSet set) { + unicodeSet = set.freeze(); + return this; + } + + public UnicodeSetProperty set(String string) { + // TODO Auto-generated method stub + return set(new UnicodeSet(string).freeze()); + } + + protected String _getValue(int codepoint) { + return YESNO_ARRAY[unicodeSet.contains(codepoint) ? 
0 : 1]; + } + + protected List _getAvailableValues(List result) { + return YESNO; + } + } + + private static class StringTransformProperty extends SimpleProperty { + Transform transform; + + public StringTransformProperty(Transform transform, boolean hasUniformUnassigned) { + this.transform = transform; + setUniformUnassigned(hasUniformUnassigned); + } + protected String _getValue(int codepoint) { + return transform.transform(UTF16.valueOf(codepoint)); + } + } + + private static class CodepointTransformProperty extends SimpleProperty { + Transform transform; + + public CodepointTransformProperty(Transform transform, boolean hasUniformUnassigned) { + this.transform = transform; + setUniformUnassigned(hasUniformUnassigned); + } + protected String _getValue(int codepoint) { + return transform.transform(codepoint); + } + } } diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodePropertySymbolTable.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodePropertySymbolTable.java new file mode 100644 index 00000000000..5e39d3889c4 --- /dev/null +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodePropertySymbolTable.java @@ -0,0 +1 @@ +/* ******************************************************************************* * Copyright (C) 1996-2011, Google, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test.util; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.Set; import com.ibm.icu.dev.test.util.UnicodeProperty.PatternMatcher; import com.ibm.icu.impl.UnicodeRegex; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; /** * Allows for overriding the parsing of UnicodeSet property patterns. *

* WARNING: If this UnicodePropertySymbolTable is used with {@code UnicodeSet.setDefaultXSymbolTable}, and the * Unassigned characters (gc=Cn) are different than in ICU other than in ICU, you MUST call * {@code UnicodeProperty.ResetCacheProperties} afterwards. If you then call {@code UnicodeSet.setDefaultXSymbolTable} * with null to clear the value, you MUST also call {@code UnicodeProperty.ResetCacheProperties}. * * @author markdavis */ public class UnicodePropertySymbolTable extends UnicodeSet.XSymbolTable { UnicodeRegex unicodeRegex; final UnicodeProperty.Factory factory; public UnicodePropertySymbolTable(UnicodeProperty.Factory factory) { unicodeRegex = new UnicodeRegex().setSymbolTable(this); this.factory = factory; } // public boolean applyPropertyAlias0(String propertyName, // String propertyValue, UnicodeSet result) { // if (!propertyName.contains("*")) { // return applyPropertyAlias(propertyName, propertyValue, result); // } // String[] propertyNames = propertyName.split("[*]"); // for (int i = propertyNames.length - 1; i >= 0; ++i) { // String pname = propertyNames[i]; // // } // return null; // } public boolean applyPropertyAlias(String propertyName, String propertyValue, UnicodeSet result) { boolean status = false; boolean invert = false; int posNotEqual = propertyName.indexOf('\u2260'); int posColon = propertyName.indexOf(':'); if (posNotEqual >= 0 || posColon >= 0) { if (posNotEqual < 0) posNotEqual = propertyName.length(); if (posColon < 0) posColon = propertyName.length(); int opPos = posNotEqual < posColon ? posNotEqual : posColon; propertyValue = propertyValue.length() == 0 ? 
propertyName.substring(opPos+1) : propertyName.substring(opPos+1) + "=" + propertyValue; propertyName = propertyName.substring(0,opPos); if (posNotEqual < posColon) { invert = true; } } if (propertyName.endsWith("!")) { propertyName = propertyName.substring(0, propertyName.length() - 1); invert = !invert; } propertyValue = propertyValue.trim(); if (propertyValue.length() != 0) { status = applyPropertyAlias0(propertyName, propertyValue, result); } else { try { status = applyPropertyAlias0("gc", propertyName, result); } catch (Exception e) {}; if (!status) { try { status = applyPropertyAlias0("sc", propertyName, result); } catch (Exception e) {}; if (!status) { try { status = applyPropertyAlias0(propertyName, "Yes", result); } catch (Exception e) {}; if (!status) { status = applyPropertyAlias0(propertyName, "", result); } } } } if (status && invert) { result.complement(); } return status; } static final HashMap GC_REMAP = new HashMap(); { GC_REMAP.put("c", "Cc Cf Cn Co Cs".split(" ")); GC_REMAP.put("other", GC_REMAP.get("c")); GC_REMAP.put("l", "Ll Lm Lo Lt Lu".split(" ")); GC_REMAP.put("letter", GC_REMAP.get("l")); GC_REMAP.put("lc", "Ll Lt Lu".split(" ")); GC_REMAP.put("casedletter", GC_REMAP.get("lc")); GC_REMAP.put("m", "Mc Me Mn".split(" ")); GC_REMAP.put("mark", GC_REMAP.get("m")); GC_REMAP.put("n", "Nd Nl No".split(" ")); GC_REMAP.put("number", GC_REMAP.get("n")); GC_REMAP.put("p", "Pc Pd Pe Pf Pi Po Ps".split(" ")); GC_REMAP.put("punctuation", GC_REMAP.get("p")); GC_REMAP.put("punct", GC_REMAP.get("p")); GC_REMAP.put("s", "Sc Sk Sm So".split(" ")); GC_REMAP.put("symbol", GC_REMAP.get("s")); GC_REMAP.put("z", "Zl Zp Zs".split(" ")); GC_REMAP.put("separator", GC_REMAP.get("z")); } public boolean applyPropertyAlias0(String propertyName, String propertyValue, UnicodeSet result) { result.clear(); UnicodeProperty prop = factory.getProperty(propertyName); String canonicalName = prop.getName(); boolean isAge = UnicodeProperty.equalNames("Age", canonicalName); // Hack 
for special GC values if (canonicalName.equals("General_Category")) { String[] parts = GC_REMAP.get(UnicodeProperty.toSkeleton(propertyValue)); if (parts != null) { for (String part : parts) { prop.getSet(part, result); } return true; } } PatternMatcher patternMatcher = null; if (propertyValue.length() > 1 && propertyValue.startsWith("/") && propertyValue.endsWith("/")) { String fixedRegex = unicodeRegex.transform(propertyValue.substring(1, propertyValue.length() - 1)); patternMatcher = new UnicodeProperty.RegexMatcher().set(fixedRegex); } UnicodeProperty otherProperty = null; boolean testCp = false; if (propertyValue.length() > 1 && propertyValue.startsWith("@") && propertyValue.endsWith("@")) { String otherPropName = propertyValue.substring(1, propertyValue.length() - 1).trim(); if ("cp".equalsIgnoreCase(otherPropName)) { testCp = true; } else { otherProperty = factory.getProperty(otherPropName); } } if (prop != null) { UnicodeSet set; if (testCp) { set = new UnicodeSet(); for (int i = 0; i <= 0x10FFFF; ++i) { if (UnicodeProperty.equals(i, prop.getValue(i))) { set.add(i); } } } else if (otherProperty != null) { set = new UnicodeSet(); for (int i = 0; i <= 0x10FFFF; ++i) { String v1 = prop.getValue(i); String v2 = otherProperty.getValue(i); if (UnicodeProperty.equals(v1, v2)) { set.add(i); } } } else if (patternMatcher == null) { if (!isValid(prop, propertyValue)) { throw new IllegalArgumentException("The value '" + propertyValue + "' is illegal. 
Values for " + propertyName + " must be in " + prop.getAvailableValues() + " or in " + prop.getValueAliases()); } if (isAge) { set = prop.getSet(new ComparisonMatcher(propertyValue, Relation.geq)); } else { set = prop.getSet(propertyValue); } } else if (isAge) { set = new UnicodeSet(); List values = prop.getAvailableValues(); for (String value : values) { if (patternMatcher.matches(value)) { for (String other : values) { if (other.compareTo(value) <= 0) { set.addAll(prop.getSet(other)); } } } } } else { set = prop.getSet(patternMatcher); } result.addAll(set); return true; } throw new IllegalArgumentException("Illegal property: " + propertyName); } private boolean isValid(UnicodeProperty prop, String propertyValue) { // if (prop.getName().equals("General_Category")) { // if (propertyValue) // } return prop.isValidValue(propertyValue); } public enum Relation {less, leq, equal, geq, greater} public static class ComparisonMatcher implements PatternMatcher { Relation relation; static Comparator comparator = new UTF16.StringComparator(true, false,0); String pattern; public ComparisonMatcher(String pattern, Relation comparator) { this.relation = comparator; this.pattern = pattern; } public boolean matches(Object value) { int comp = comparator.compare(pattern, value.toString()); switch (relation) { case less: return comp < 0; case leq: return comp <= 0; default: return comp == 0; case geq: return comp >= 0; case greater: return comp > 0; } } public PatternMatcher set(String pattern) { this.pattern = pattern; return this; } } } \ No newline at end of file diff --git a/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeTransform.java b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeTransform.java new file mode 100644 index 00000000000..69359ecd03d --- /dev/null +++ b/icu4j/main/tests/translit/src/com/ibm/icu/dev/test/util/UnicodeTransform.java @@ -0,0 +1 @@ +/* ******************************************************************************* * 
Copyright (C) 2011, Google, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */
package com.ibm.icu.dev.test.util;

import com.ibm.icu.text.Transform;
import com.ibm.icu.text.UTF16;

/**
 * Simple wrapping for normalizer that allows for both the standard ICU normalizer, and one built directly from the UCD.
 * Instances are obtained from the currently installed {@link Factory} via {@link #getInstance(Type)}.
 */
public abstract class UnicodeTransform implements Transform<String, String> {

    /** The kinds of transform a {@link Factory} can be asked for. */
    public enum Type {
        NFD, NFC, NFKD, NFKC, CASEFOLD
    }

    /** Supplies the {@link UnicodeTransform} for a requested {@link Type}. */
    public interface Factory {
        public UnicodeTransform getInstance(Type type);
    }

    /** Factory consulted by {@link #getInstance(Type)}; starts out as an {@code IcuUnicodeNormalizerFactory}. */
    private static Factory factory = new IcuUnicodeNormalizerFactory();

    /** Returns the currently installed factory. */
    public static synchronized Factory getFactory() {
        return factory;
    }

    /**
     * Installs the factory used by later {@link #getInstance(Type)} calls.
     *
     * @param factory the replacement factory
     */
    public static synchronized void setFactory(Factory factory) {
        UnicodeTransform.factory = factory;
    }

    /**
     * Asks the currently installed factory for the transform of the given type.
     *
     * @param type the requested kind of transform
     * @return whatever the installed factory returns for {@code type}
     */
    public static synchronized UnicodeTransform getInstance(Type type) {
        return factory.getInstance(type);
    }

    /**
     * Transforms {@code source}.
     *
     * @param source the string to transform
     * @return the transformed string
     */
    public abstract String transform(String source);

    /**
     * Returns whether {@code source} equals its own transform. Can be overridden for performance.
     */
    public boolean isTransformed(String source) {
        return source.equals(transform(source));
    }

    /**
     * Transforms the string form of a code point, {@code UTF16.valueOf(source)}. Can be overridden for
     * performance.
     */
    public String transform(int source) {
        return transform(UTF16.valueOf(source));
    }

    /**
     * Returns whether the string form of a code point equals its own transform. Can be overridden for
     * performance.
     */
    public boolean isTransformed(int source) {
        return isTransformed(UTF16.valueOf(source));
    }
}
\ No newline at end of file