ICU-4700 misc fixes

X-SVN-Rev: 18773
This commit is contained in:
Mark Davis 2005-11-08 05:20:00 +00:00
parent 99d3e09191
commit 6509d8087c
5 changed files with 198 additions and 47 deletions

View file

@ -14,6 +14,7 @@ import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import com.ibm.icu.impl.UCharacterProperty;
import com.ibm.icu.lang.UCharacter;
@ -39,13 +40,23 @@ public final class CollectionUtilities {
return target;
}
public static Collection addAll(Collection target, Iterator source) {
/**
 * Drains the given iterator into the target collection.
 * @param source iterator whose remaining elements are consumed
 * @param target collection that receives every element, in iteration order
 * @return target, so that calls can be chained
 */
public static Collection addAll(Iterator source, Collection target) {
    for (Iterator cursor = source; cursor.hasNext(); ) {
        Object element = cursor.next();
        target.add(element);
    }
    return target;
}
/**
 * Counts the elements remaining in an iterator. The iterator is fully
 * consumed by this call.
 * @param source iterator to exhaust
 * @return number of elements that were returned by source
 */
public static int size(Iterator source) {
    int count = 0;
    for (; source.hasNext(); source.next()) {
        count++;
    }
    return count;
}
public static Map asMap(Object[][] source) {
return asMap(source, new HashMap(), false);
}
@ -409,4 +420,84 @@ public final class CollectionUtilities {
}
}
/**
 * Flattens the string elements of a UnicodeSet in place, e.g. [abc{da}] => [abcd].
 * Ranges are copied unchanged; each string element is merged in through
 * UnicodeSet.addAll(String). The input set is only overwritten when at least
 * one string element was encountered.
 * @param exemplar1 the set to flatten (modified in place when it contains strings)
 * @return exemplar1 itself, for chaining
 */
public static UnicodeSet flatten(UnicodeSet exemplar1) {
    UnicodeSet flattened = new UnicodeSet();
    boolean sawString = false;
    UnicodeSetIterator ranges = new UnicodeSetIterator(exemplar1);
    while (ranges.nextRange()) {
        if (ranges.codepoint != UnicodeSetIterator.IS_STRING) {
            flattened.add(ranges.codepoint, ranges.codepointEnd);
            continue;
        }
        flattened.addAll(ranges.string);
        sawString = true;
    }
    if (sawString) {
        exemplar1.set(flattened);
    }
    return exemplar1;
}
/**
 * Base class for producing filtered iterators: wraps another iterator and
 * exposes only the elements for which {@link #isIncluded(Object)} returns true.
 * Call {@link #set(Iterator)} to supply the underlying iterator before use;
 * the same filter object can be re-pointed at a new iterator by calling
 * set again. Removal is not supported.
 */
public static abstract class FilteredIterator implements Iterator {
    private Iterator baseIterator;
    // Identity-compared sentinels, so that null remains a legal element value.
    // EMPTY: no look-ahead element is buffered. DONE: the base iterator is exhausted.
    private static final Object EMPTY = new Object();
    private static final Object DONE = new Object();
    private Object nextObject = EMPTY;

    /**
     * Sets the iterator to be filtered and discards any buffered look-ahead
     * state left over from a previously set iterator.
     * @param baseIterator the underlying iterator
     * @return this, for chaining
     */
    public FilteredIterator set(Iterator baseIterator) {
        this.baseIterator = baseIterator;
        // Without this reset a reused filter would stay DONE forever, or
        // would first hand out an element buffered from the old iterator.
        this.nextObject = EMPTY;
        return this;
    }

    /**
     * Always throws; filtered iterators are read-only.
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("Doesn't support removal");
    }

    /**
     * Returns the next element accepted by the filter. May be called without
     * a preceding hasNext().
     * @return the next included element
     * @throws java.util.NoSuchElementException if no included element remains
     */
    public Object next() {
        // hasNext() fills the look-ahead slot; without this call the internal
        // EMPTY/DONE sentinels could leak out to the caller.
        if (!hasNext()) {
            throw new java.util.NoSuchElementException();
        }
        Object result = nextObject;
        nextObject = EMPTY;
        return result;
    }

    /**
     * Advances the underlying iterator until an included element is found and
     * buffers it. Repeated calls without next() do not advance further.
     * @return true if another included element is available
     */
    public boolean hasNext() {
        if (nextObject == DONE) return false;
        if (nextObject != EMPTY) return true;
        while (baseIterator.hasNext()) {
            nextObject = baseIterator.next();
            if (isIncluded(nextObject)) {
                return true;
            }
        }
        nextObject = DONE;
        return false;
    }

    /**
     * Decides whether an element of the underlying iterator is passed through.
     * @param item element taken from the underlying iterator
     * @return true to include the element in the filtered iteration
     */
    abstract public boolean isIncluded(Object item);
}
/**
 * Filtered iterator over String elements that passes through only those
 * starting with a given prefix. Elements are cast to String, so the
 * underlying iterator must yield Strings.
 */
public static class PrefixIterator extends FilteredIterator {
    private String prefix;

    /**
     * Sets both the underlying iterator and the required prefix.
     * @param baseIterator iterator to filter
     * @param prefix prefix each returned String must start with
     * @return this, for chaining
     */
    public PrefixIterator set(Iterator baseIterator, String prefix) {
        this.prefix = prefix;
        super.set(baseIterator);
        return this;
    }

    /** Includes the item when it starts with the configured prefix. */
    public boolean isIncluded(Object item) {
        String candidate = (String) item;
        return candidate.startsWith(prefix);
    }
}
/**
 * Filtered iterator over String elements that passes through only those
 * matched in full by a regular-expression Matcher. The supplied Matcher is
 * reset onto each candidate element, so it is mutated during iteration.
 */
public static class RegexIterator extends FilteredIterator {
    private Matcher matcher;

    /**
     * Sets both the underlying iterator and the matcher used for filtering.
     * @param baseIterator iterator to filter
     * @param matcher matcher that is reset onto each element and tested with matches()
     * @return this, for chaining
     */
    public RegexIterator set(Iterator baseIterator, Matcher matcher) {
        this.matcher = matcher;
        super.set(baseIterator);
        return this;
    }

    /** Includes the item when the matcher matches the whole String. */
    public boolean isIncluded(Object item) {
        String candidate = (String) item;
        matcher.reset(candidate);
        return matcher.matches();
    }
}
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
* Copyright (C) 2002-2004, International Business Machines Corporation and *
* Copyright (C) 2002-2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -151,6 +151,7 @@ public abstract class Tabber {
setPostfix("</tr>");
}
/**
 * Stores the parameter string for the given column index, first padding
 * the parameter list with nulls until that index exists.
 * @param count index of the column whose parameters are set
 * @param params parameter string to store at that index
 */
public void setParameters(int count, String params) {
    while (parameters.size() <= count) {
        parameters.add(null);
    }
    parameters.set(count, params);
}

View file

@ -406,7 +406,11 @@ public class MakeNamesChart {
//String hex = Utility.hex(cp);
//return "<img alt='" + hex + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + hex + "'>";
}
int type = Default.ucd().getCategory(cp);
if (type == UCD.Cn || type == UCD.Co || type == UCD.Cs) {
return "\u2588";
}
String result = BagFormatter.toHTML.transliterate(UTF16.valueOf(cp));
if (type == UCD.Me || type == UCD.Mn) {
result = "\u25CC" + result;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $
* $Date: 2005/11/01 00:10:54 $
* $Revision: 1.8 $
* $Date: 2005/11/08 05:19:59 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -32,8 +32,8 @@ import com.ibm.text.utility.*;
public class QuickTest implements UCD_Types {
public static void main(String[] args) throws IOException {
try {
//getBidiMirrored();
getCaseFoldingUnstable();
getBidiMirrored();
//getCaseFoldingUnstable();
if (true) return;
getHasAllNormalizations();
getLengths("NFC", Default.nfc());
@ -115,61 +115,116 @@ public class QuickTest implements UCD_Types {
}
}
/**
 * Composer that merges two values for the same code point: a null side
 * yields the other side unchanged; otherwise the two are joined as
 * "a; b".
 */
static UnicodeMap.Composer MyComposer = new UnicodeMap.Composer(){
    public Object compose(int codePoint, Object a, Object b) {
        if (a != null && b != null) {
            return a + "; " + b;
        }
        return (a == null) ? b : a;
    }
};
private static void getBidiMirrored() {
ToolUnicodePropertySource foo = ToolUnicodePropertySource.make("");
/**
 * Appends a label to the String value stored for a code point. If the map
 * has no value for the code point the label is stored as-is; otherwise it
 * is appended to the existing value, separated by "; ".
 * @param map map from code points to String labels
 * @param cp code point to update
 * @param s label to store or append
 */
static void add(UnicodeMap map, int cp, String s) {
    String existing = (String) map.getValue(cp);
    String merged = (existing == null) ? s : existing + "; " + s;
    map.put(cp, merged);
}
private static void getBidiMirrored() throws IOException {
//UnicodeMap.Composer composer;
//ToolUnicodePropertySource foo = ToolUnicodePropertySource.make("");
UnicodeSet proposed = new UnicodeSet("[\u0F3A-\u0F3D\u169B\u169C\u2018-\u201F\u301D-\u301F\uFD3E\uFD3F\uFE59-\uFE5E\uFE64\uFE65\\U0001D6DB\\U0001D715\\U0001D74F\\U0001D789\\U0001D7C3]");
//UnicodeSet proposed = new UnicodeSet("[\u0F3A-\u0F3D\u169B\u169C\u2018-\u201F\u301D-\u301F\uFD3E\uFD3F\uFE59-\uFE5E\uFE64\uFE65]");
UnicodeMap status = new UnicodeMap();
status.putAll(foo.getSet("generalcategory=ps"), "*open/close*");
status.putAll(foo.getSet("generalcategory=pe"), "*open/close*");
status.putAll(foo.getSet("generalcategory=pi"), "*open/close*");
status.putAll(foo.getSet("generalcategory=pf"), "*open/close*");
UCD ucd31 = UCD.make("3.1.0");
for (int cp = 0; cp < 0x10FFFF; ++cp) {
if (!Default.ucd().isAssigned(cp)) continue;
if (Default.ucd().isPUA(cp)) continue;
if (proposed.contains(cp)) {
add(status, cp, "***");
}
int type = Default.ucd().getCategory(cp);
if (type == UCD.Ps || type == Pe || type == Pi || type == Pf) {
add(status, cp, "Px");
}
String s = Default.ucd().getBidiMirror(cp);
if (!s.equals(UTF16.valueOf(cp))) add(status, cp, "bmg");
if (ucd31.getBinaryProperty(cp,BidiMirrored)) {
add(status, cp, "bmp3.1");
} else if (Default.ucd().getBinaryProperty(cp,BidiMirrored)) {
add(status, cp, "bmp5.0");
} else if (!Default.nfkc().isNormalized(cp)) {
String ss = Default.nfkc().normalize(cp);
if (isBidiMirrored(ss)) {
add(status, cp, "bmp(" + Utility.hex(ss) + ")");
String name = Default.ucd().getName(cp);
if (name.indexOf("VERTICAL") < 0) proposed.add(cp);
}
}
if (type == Sm) {
add(status, cp, "Sm");
}
else if (Default.ucd().getBinaryProperty(cp,Math_Property)) {
String ss = Default.nfkc().normalize(cp);
if (UTF16.countCodePoint(ss) == 1) {
int cp2 = UTF16.charAt(ss, 0);
int type2 = Default.ucd().getCategory(cp2);
if (type2 == UCD.Lu || type2 == Ll || type2 == Lo || type2 == Nd) {
//System.out.println("Skipping: " + Default.ucd().getCodeAndName(cp));
} else {
add(status, cp, "S-Math");
}
} else {
add(status, cp, "S-Math");
}
}
UnicodeSet bidiMirroredSet = foo.getSet("bidimirrored=true");
status.putAll(bidiMirroredSet, "*core*");
UnicodeSet bidiMirroringSet = new UnicodeSet();
UnicodeProperty x = foo.getProperty("bidimirroringglyph");
for (int i = 0; i < 0x10FFFF; ++i) {
String s = x.getValue(i);
if (!s.equals(UTF16.valueOf(i))) bidiMirroringSet.add(i);
}
status.putAll(new UnicodeSet(bidiMirroredSet).removeAll(bidiMirroringSet), "no bidi mirroring");
UnicodeSet mathSet = foo.getSet("generalcategory=sm");
status.putAll(mathSet, "math");
// temp = new UnicodeMap();
// UnicodeSet special = new UnicodeSet("[<>]");
// for (UnicodeSetIterator it = new UnicodeSetIterator(mathSet); it.next();) {
// String s = Default.nfkd().normalize(it.codepoint);
// if (special.containsSome(s)) temp.put(it.codepoint, "*special*");
// }
// status.composeWith(temp, MyComposer);
UnicodeSet special = new UnicodeSet("[<>]");
for (UnicodeSetIterator it = new UnicodeSetIterator(mathSet); it.next();) {
String s = Default.nfkd().normalize(it.codepoint);
if (special.containsSome(s)) status.put(it.codepoint, "*special*");
}
//showStatus(status);
// close under nfd
for (int i = 0; i < 0x10FFFF; ++i) {
if (!Default.ucd().isAssigned(i)) continue;
if (!Default.ucd().isPUA(i)) continue;
if (Default.nfkc().isNormalized(i)) continue;
String oldValue = (String) status.getValue(i);
if (oldValue != null) continue;
String s = Default.nfkc().normalize(i);
if (UTF16.countCodePoint(s) != 1) continue;
int cp = UTF16.charAt(s, 0);
String value = (String)status.getValue(cp);
if (value != null) status.put(i, "nfc-closure-" + value);
}
showStatus(status, bidiMirroredSet);
//proposed = status.getSet("Px");
System.out.println(proposed);
//showStatus(status);
PrintWriter pw = BagFormatter.openUTF8Writer(UCD.GEN_DIR, "bidimirroring_chars.txt");
showStatus(pw, status);
pw.close();
}
/**
 * Tests whether every code point of a string has the BidiMirrored binary
 * property in the default UCD. The string is walked by code point, not by
 * UTF-16 code unit. An empty string yields true.
 * @param ss string to examine
 * @return false as soon as one code point lacks the property, true otherwise
 */
private static boolean isBidiMirrored(String ss) {
    int index = 0;
    while (index < ss.length()) {
        int codePoint = UTF16.charAt(ss, index);
        if (!Default.ucd().getBinaryProperty(codePoint, BidiMirrored)) {
            return false;
        }
        // step over one or two code units, depending on the code point
        index += UTF16.getCharCount(codePoint);
    }
    return true;
}
static BagFormatter bf = new BagFormatter();
private static void showStatus(UnicodeMap status, UnicodeSet x) {
private static void showStatus(PrintWriter pw, UnicodeMap status) {
Collection list = new TreeSet(status.getAvailableValues());
for (Iterator it = list.iterator(); it.hasNext(); ) {
String value = (String) it.next();
if (value == null) continue;
UnicodeSet set = status.getSet(value);
for (UnicodeSetIterator umi = new UnicodeSetIterator(set); umi.next();) {
System.out.println(Utility.hex(umi.codepoint)
+ (value.startsWith("*") ? ";\tBidi_Mirrored" : "")
+ "\t#\t" + value
pw.println(Utility.hex(umi.codepoint)
//+ (value.startsWith("*") ? ";\tBidi_Mirrored" : "")
+ "\t# " + value
+ "\t\t( " + UTF16.valueOf(umi.codepoint) + " ) "
//+ ";\t" + (x.contains(umi.codepoint) ? "O" : "")
+ "\t" + Default.ucd().getName(umi.codepoint));
}

View file

@ -137,8 +137,8 @@ Show [$name:«.*LETTER.*» - $alphabetic]
# Pattern characters are invariant!
# Add after 4.1.0
#$Pattern_Whitespace = $×Pattern_Whitespace
#$Pattern_Syntax = $×Pattern_Syntax
$Pattern_Whitespace = $×Pattern_Whitespace
$Pattern_Syntax = $×Pattern_Syntax
#BIDI invariant constants
Let $R_blocks = [$block:Kharoshthi $block:Hebrew $block:Cypriot_Syllabary \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF]