diff --git a/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java b/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java index 577927bddc8..8d62bfc7ca9 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/CollectionUtilities.java @@ -14,6 +14,7 @@ import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; +import java.util.regex.Matcher; import com.ibm.icu.impl.UCharacterProperty; import com.ibm.icu.lang.UCharacter; @@ -39,13 +40,23 @@ public final class CollectionUtilities { return target; } - public static Collection addAll(Collection target, Iterator source) { + public static Collection addAll(Iterator source, Collection target) { while (source.hasNext()) { target.add(source.next()); } return target; // for chaining } + public static int size(Iterator source) { + int result = 0; + while (source.hasNext()) { + source.next(); + ++result; + } + return result; + } + + public static Map asMap(Object[][] source) { return asMap(source, new HashMap(), false); } @@ -409,4 +420,84 @@ public final class CollectionUtilities { } } + /** + * Modifies Unicode set to flatten the strings. Eg [abc{da}] => [abcd] + * Returns the set for chaining. + * @param exemplar1 + * @return + */ + public static UnicodeSet flatten(UnicodeSet exemplar1) { + UnicodeSet result = new UnicodeSet(); + boolean gotString = false; + for (UnicodeSetIterator it = new UnicodeSetIterator(exemplar1); it.nextRange();) { + if (it.codepoint == it.IS_STRING) { + result.addAll(it.string); + gotString = true; + } else { + result.add(it.codepoint, it.codepointEnd); + } + } + if (gotString) exemplar1.set(result); + return exemplar1; + } + + /** + * For producing filtered iterators + */ + public static abstract class FilteredIterator implements Iterator { + private Iterator baseIterator; + private static final Object EMPTY = new Object(); + private static final Object DONE = new Object(); + private Object nextObject = EMPTY; + public FilteredIterator set(Iterator baseIterator) { + this.baseIterator = baseIterator; + return this; + } + public void remove() { + throw new UnsupportedOperationException("Doesn't support removal"); + } + public Object next() { + Object result = nextObject; + nextObject = EMPTY; + return result; + } + public boolean hasNext() { + if (nextObject == DONE) return false; + if (nextObject != EMPTY) return true; + while (baseIterator.hasNext()) { + nextObject = baseIterator.next(); + if (isIncluded(nextObject)) { + return true; + } + } + nextObject = DONE; + return false; + } + abstract public boolean isIncluded(Object item); + } + + public static class PrefixIterator extends FilteredIterator { + private String prefix; + public PrefixIterator set(Iterator baseIterator, String prefix) { + super.set(baseIterator); + this.prefix = prefix; + return this; + } + public boolean isIncluded(Object item) { + return ((String)item).startsWith(prefix); + } + } + + public static class RegexIterator extends FilteredIterator { + private Matcher matcher; + public RegexIterator set(Iterator baseIterator, Matcher matcher) { + super.set(baseIterator); + this.matcher = matcher; + return this; + } + public boolean isIncluded(Object item) { + return matcher.reset((String)item).matches(); + } + } + } \ No newline at end of file diff --git a/icu4j/src/com/ibm/icu/dev/test/util/Tabber.java b/icu4j/src/com/ibm/icu/dev/test/util/Tabber.java index 1a5cd5446ff..5c1d2efe152 100644 --- a/icu4j/src/com/ibm/icu/dev/test/util/Tabber.java +++ b/icu4j/src/com/ibm/icu/dev/test/util/Tabber.java @@ -1,7 +1,7 @@ /* ******************************************************************************* - * Copyright (C) 2002-2004, International Business Machines Corporation and * + * Copyright (C) 2002-2005, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -151,6 +151,7 @@ public abstract class Tabber { setPostfix(""); } public void setParameters(int count, String params) { + while (count >= parameters.size()) parameters.add(null); parameters.set(count,params); } diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java b/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java index 73a2d6ad8d8..e56e21967c2 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeNamesChart.java @@ -406,7 +406,11 @@ public class MakeNamesChart { //String hex = Utility.hex(cp); //return "" + hex + ""; } + int type = Default.ucd().getCategory(cp); + if (type == UCD.Cn || type == UCD.Co || type == UCD.Cs) { + return "\u2588"; + } String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp)); if (type == UCD.Me || type == UCD.Mn) { result = "\u25CC" + result; diff --git a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java index 6a2a22a1b06..a25eddb2014 100644 --- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ -* $Date: 2005/11/01 00:10:54 $ -* $Revision: 1.8 $ +* $Date: 2005/11/08 05:19:59 $ +* $Revision: 1.9 $ * ******************************************************************************* */ @@ -32,8 +32,8 @@ import com.ibm.text.utility.*; public class QuickTest implements UCD_Types { public static void main(String[] args) throws IOException { try { - //getBidiMirrored(); - getCaseFoldingUnstable(); + getBidiMirrored(); + //getCaseFoldingUnstable(); if (true) return; getHasAllNormalizations(); getLengths("NFC", Default.nfc()); @@ -115,61 +115,116 @@ public class QuickTest implements UCD_Types { } } + static UnicodeMap.Composer MyComposer = new UnicodeMap.Composer(){ + public Object compose(int codePoint, Object a, Object b) { + if (a == null) return b; + if (b == null) return a; + return a + "; " + b; + } + }; - - private static void getBidiMirrored() { - ToolUnicodePropertySource foo = ToolUnicodePropertySource.make(""); + static void add(UnicodeMap map, int cp, String s) { + String x = (String) map.getValue(cp); + if (x == null) map.put(cp, s); + else map.put(cp, x + "; " + s); + } + + private static void getBidiMirrored() throws IOException { + //UnicodeMap.Composer composer; + //ToolUnicodePropertySource foo = ToolUnicodePropertySource.make(""); + UnicodeSet proposed = new UnicodeSet("[\u0F3A-\u0F3D\u169B\u169C\u2018-\u201F\u301D-\u301F\uFD3E\uFD3F\uFE59-\uFE5E\uFE64\uFE65\\U0001D6DB\\U0001D715\\U0001D74F\\U0001D789\\U0001D7C3]"); + //UnicodeSet proposed = new UnicodeSet("[\u0F3A-\u0F3D\u169B\u169C\u2018-\u201F\u301D-\u301F\uFD3E\uFD3F\uFE59-\uFE5E\uFE64\uFE65]"); UnicodeMap status = new UnicodeMap(); - status.putAll(foo.getSet("generalcategory=ps"), "*open/close*"); - status.putAll(foo.getSet("generalcategory=pe"), "*open/close*"); - status.putAll(foo.getSet("generalcategory=pi"), "*open/close*"); - status.putAll(foo.getSet("generalcategory=pf"), "*open/close*"); + UCD ucd31 = UCD.make("3.1.0"); + for (int cp = 0; cp < 0x10FFFF; ++cp) { + if (!Default.ucd().isAssigned(cp)) continue; + if (Default.ucd().isPUA(cp)) continue; + + if (proposed.contains(cp)) { + add(status, cp, "***"); + } + + int type = Default.ucd().getCategory(cp); + if (type == UCD.Ps || type == Pe || type == Pi || type == Pf) { + add(status, cp, "Px"); + } + + String s = Default.ucd().getBidiMirror(cp); + if (!s.equals(UTF16.valueOf(cp))) add(status, cp, "bmg"); + + if (ucd31.getBinaryProperty(cp,BidiMirrored)) { + add(status, cp, "bmp3.1"); + } else if (Default.ucd().getBinaryProperty(cp,BidiMirrored)) { + add(status, cp, "bmp5.0"); + } else if (!Default.nfkc().isNormalized(cp)) { + String ss = Default.nfkc().normalize(cp); + if (isBidiMirrored(ss)) { + add(status, cp, "bmp(" + Utility.hex(ss) + ")"); + String name = Default.ucd().getName(cp); + if (name.indexOf("VERTICAL") < 0) proposed.add(cp); + } + + } + + if (type == Sm) { + add(status, cp, "Sm"); + } + else if (Default.ucd().getBinaryProperty(cp,Math_Property)) { + String ss = Default.nfkc().normalize(cp); + if (UTF16.countCodePoint(ss) == 1) { + int cp2 = UTF16.charAt(ss, 0); + int type2 = Default.ucd().getCategory(cp2); + if (type2 == UCD.Lu || type2 == Ll || type2 == Lo || type2 == Nd) { + //System.out.println("Skipping: " + Default.ucd().getCodeAndName(cp)); + } else { + add(status, cp, "S-Math"); + } + } else { + add(status, cp, "S-Math"); + } + } - UnicodeSet bidiMirroredSet = foo.getSet("bidimirrored=true"); - status.putAll(bidiMirroredSet, "*core*"); - UnicodeSet bidiMirroringSet = new UnicodeSet(); - UnicodeProperty x = foo.getProperty("bidimirroringglyph"); - for (int i = 0; i < 0x10FFFF; ++i) { - String s = x.getValue(i); - if (!s.equals(UTF16.valueOf(i))) bidiMirroringSet.add(i); - } - status.putAll(new UnicodeSet(bidiMirroredSet).removeAll(bidiMirroringSet), "no bidi mirroring"); - UnicodeSet mathSet = foo.getSet("generalcategory=sm"); - status.putAll(mathSet, "math"); +// temp = new UnicodeMap(); +// UnicodeSet special = new UnicodeSet("[<>]"); +// for (UnicodeSetIterator it = new UnicodeSetIterator(mathSet); it.next();) { +// String s = Default.nfkd().normalize(it.codepoint); +// if (special.containsSome(s)) temp.put(it.codepoint, "*special*"); +// } +// status.composeWith(temp, MyComposer); - UnicodeSet special = new UnicodeSet("[<>]"); - for (UnicodeSetIterator it = new UnicodeSetIterator(mathSet); it.next();) { - String s = Default.nfkd().normalize(it.codepoint); - if (special.containsSome(s)) status.put(it.codepoint, "*special*"); - } //showStatus(status); // close under nfd - for (int i = 0; i < 0x10FFFF; ++i) { - if (!Default.ucd().isAssigned(i)) continue; - if (!Default.ucd().isPUA(i)) continue; - if (Default.nfkc().isNormalized(i)) continue; - String oldValue = (String) status.getValue(i); - if (oldValue != null) continue; - String s = Default.nfkc().normalize(i); - if (UTF16.countCodePoint(s) != 1) continue; - int cp = UTF16.charAt(s, 0); - String value = (String)status.getValue(cp); - if (value != null) status.put(i, "nfc-closure-" + value); + } - showStatus(status, bidiMirroredSet); + //proposed = status.getSet("Px"); + System.out.println(proposed); + //showStatus(status); + PrintWriter pw = BagFormatter.openUTF8Writer(UCD.GEN_DIR, "bidimirroring_chars.txt"); + showStatus(pw, status); + pw.close(); + } + + private static boolean isBidiMirrored(String ss) { + int cp; + for (int i = 0; i < ss.length(); i += UTF16.getCharCount(cp)) { + cp = UTF16.charAt(ss, i); + if (!Default.ucd().getBinaryProperty(cp,BidiMirrored)) return false; + } + return true; } static BagFormatter bf = new BagFormatter(); - private static void showStatus(UnicodeMap status, UnicodeSet x) { + private static void showStatus(PrintWriter pw, UnicodeMap status) { Collection list = new TreeSet(status.getAvailableValues()); for (Iterator it = list.iterator(); it.hasNext(); ) { String value = (String) it.next(); if (value == null) continue; UnicodeSet set = status.getSet(value); for (UnicodeSetIterator umi = new UnicodeSetIterator(set); umi.next();) { - System.out.println(Utility.hex(umi.codepoint) - + (value.startsWith("*") ? ";\tBidi_Mirrored" : "") - + "\t#\t" + value + pw.println(Utility.hex(umi.codepoint) + //+ (value.startsWith("*") ? ";\tBidi_Mirrored" : "") + + "\t# " + value + + "\t\t( " + UTF16.valueOf(umi.codepoint) + " ) " //+ ";\t" + (x.contains(umi.codepoint) ? "O" : "") + "\t" + Default.ucd().getName(umi.codepoint)); } diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt index 66f001c23a9..cde0301f3fa 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt @@ -137,8 +137,8 @@ Show [$name:«.*LETTER.*» - $alphabetic] # Pattern characters are invariant! # Add after 4.1.0 -#$Pattern_Whitespace = $×Pattern_Whitespace -#$Pattern_Syntax = $×Pattern_Syntax +$Pattern_Whitespace = $×Pattern_Whitespace +$Pattern_Syntax = $×Pattern_Syntax #BIDI invariant constants Let $R_blocks = [$block:Kharoshthi $block:Hebrew $block:Cypriot_Syllabary \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF]