diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java index d2fe6ea2877..1a80b4d4e7e 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java @@ -540,7 +540,7 @@ public class MakeUnicodeFiles { = ToolUnicodePropertySource.make(Default.ucdVersion()); TreeSet sortedSet = new TreeSet(CASELESS_COMPARATOR); BagFormatter bf = new BagFormatter(); - Tabber.MonoTabber mt = new Tabber.MonoTabber() + Tabber.MonoTabber mt = (Tabber.MonoTabber) new Tabber.MonoTabber() .add(10,Tabber.LEFT) .add(30,Tabber.LEFT); int count = 0; @@ -639,7 +639,7 @@ public class MakeUnicodeFiles { // 123456789012345678901234567890123 // sc ; Arab ; Arabic - Tabber.MonoTabber mt2 = new Tabber.MonoTabber() + Tabber.MonoTabber mt2 = (Tabber.MonoTabber) new Tabber.MonoTabber() .add(3,Tabber.LEFT) .add(2,Tabber.LEFT) // ; .add(10,Tabber.LEFT) @@ -649,7 +649,7 @@ public class MakeUnicodeFiles { .add(33,Tabber.LEFT); // ccc; 216; ATAR ; Attached_Above_Right - Tabber.MonoTabber mt3 = new Tabber.MonoTabber() + Tabber.MonoTabber mt3 = (Tabber.MonoTabber) new Tabber.MonoTabber() .add(3,Tabber.LEFT) .add(2,Tabber.LEFT) // ; .add(3,Tabber.RIGHT) diff --git a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java index 5c2fd44574d..b594b1e3102 100644 --- a/tools/unicodetools/com/ibm/text/UCD/QuickTest.java +++ b/tools/unicodetools/com/ibm/text/UCD/QuickTest.java @@ -5,14 +5,35 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ -* $Date: 2006/09/24 23:32:45 $ -* $Revision: 1.13 $ +* $Date: 2006/11/27 23:15:21 $ +* $Revision: 1.14 $ * ******************************************************************************* */ package com.ibm.text.UCD; +import org.unicode.cldr.util.Counter; + +import 
com.ibm.icu.dev.demo.translit.CaseIterator; +import com.ibm.icu.dev.test.util.BagFormatter; +import com.ibm.icu.dev.test.util.Tabber; +import com.ibm.icu.dev.test.util.UnicodeMap; +import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty; +import com.ibm.icu.impl.PrettyPrinter; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.CanonicalIterator; +import com.ibm.icu.text.Collator; +import com.ibm.icu.text.NumberFormat; +import com.ibm.icu.text.RuleBasedCollator; +import com.ibm.icu.text.Transliterator; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.text.UnicodeSetIterator; +import com.ibm.icu.util.ULocale; + import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; @@ -33,36 +54,15 @@ import java.util.StringTokenizer; import java.util.TreeMap; import java.util.TreeSet; -import org.unicode.cldr.util.Counter; - -import com.ibm.icu.dev.demo.translit.CaseIterator; -import com.ibm.icu.dev.test.util.BagFormatter; -import com.ibm.icu.dev.test.util.Tabber; -import com.ibm.icu.dev.test.util.UnicodeMap; -import com.ibm.icu.dev.test.util.UnicodeProperty.UnicodeMapProperty; -import com.ibm.icu.impl.PrettyPrinter; -import com.ibm.icu.impl.Utility; -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UProperty; -import com.ibm.icu.text.CanonicalIterator; -import com.ibm.icu.text.Collator; -//import com.ibm.icu.text.Normalizer; - -import com.ibm.icu.text.NumberFormat; -import com.ibm.icu.text.RuleBasedCollator; -import com.ibm.icu.text.Transliterator; -import com.ibm.icu.text.UTF16; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.text.UnicodeSetIterator; -import com.ibm.icu.util.ULocale; - public class QuickTest implements UCD_Types { public static void main(String[] args) throws IOException { try { - - getHangulDecomps(); + String methodName = System.getProperty("method"); + 
org.unicode.cldr.util.Utility.callMethod(methodName, QuickTest.class); + if (true) return; + getHangulDecomps(); showLeadingTrailingNonStarters(); @@ -203,8 +203,8 @@ public class QuickTest implements UCD_Types { // System.out.println(bf.showSetDifferences("NFC CWP", leadingC, "NFC Trailing", trailingC)); } - - private static void checkCaseChanges() { + + private static void checkCaseChanges() { String first = "3.0.0"; String last = "4.1.0"; UCD ucd30 = UCD.make(first); diff --git a/tools/unicodetools/com/ibm/text/UCD/ScriptTimeline.java b/tools/unicodetools/com/ibm/text/UCD/ScriptTimeline.java new file mode 100644 index 00000000000..4fc70d01c86 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/ScriptTimeline.java @@ -0,0 +1,25 @@ +package com.ibm.text.UCD; + +import com.ibm.icu.dev.test.util.UnicodeProperty; +import com.ibm.icu.lang.UScript; +import com.ibm.icu.text.UnicodeSet; + +import java.util.List; + +public class ScriptTimeline { + public static void main(String[] args) { + String[] versions = { "2.0.0", "2.1.2", "3.0.0", "3.1.0", "3.2.0", "4.0.0", "4.1.0", "5.0.0" }; + for (int s = 0; s < UScript.CODE_LIMIT; ++s) { + String scriptName = UScript.getName(s); + UnicodeSet chars = new UnicodeSet().applyPropertyAlias("script", scriptName); + if (chars.size() == 0) continue; + System.out.print(scriptName); + for (int v = 0; v < versions.length; ++v) { + UnicodeSet age = new UnicodeSet(); + age.applyPropertyAlias("age", versions[v]); + System.out.print("\t" + new UnicodeSet(chars).retainAll(age).size()); + } + System.out.println(); + } + } +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java index 3c4e0897e37..4f241736f95 100644 --- a/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java +++ b/tools/unicodetools/com/ibm/text/UCD/ToolUnicodePropertySource.java @@ -17,693 +17,764 @@ import com.ibm.icu.text.UnicodeSet; 
import com.ibm.text.utility.Utility; public class ToolUnicodePropertySource extends UnicodeProperty.Factory { - static final boolean DEBUG = false; - private UCD ucd; - private Normalizer nfc, nfd, nfkd, nfkc; + static final boolean DEBUG = false; - private static boolean needAgeCache = true; - private static UCD[] ucdCache = new UCD[UCD_Types.LIMIT_AGE]; - - private static HashMap factoryCache = new HashMap(); - - public static synchronized ToolUnicodePropertySource make(String version) { - ToolUnicodePropertySource result = (ToolUnicodePropertySource)factoryCache.get(version); - if (result != null) return result; - result = new ToolUnicodePropertySource(version); - factoryCache.put(version, result); - return result; - } - - private ToolUnicodePropertySource(String version) { - ucd = UCD.make(version); - nfc = new Normalizer(Normalizer.NFC, ucd.getVersion()); - nfd = new Normalizer(Normalizer.NFD, ucd.getVersion()); - nfkc = new Normalizer(Normalizer.NFKC, ucd.getVersion()); - nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion()); + private UCD ucd; - version = ucd.getVersion(); // regularize + private Normalizer nfc, nfd, nfkd, nfkc; - // first the special cases - if (DEBUG) System.out.println("Adding Simple Cases"); + private static boolean needAgeCache = true; - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; - return ucd.getName(codepoint); - } - }.setValues("") - .setMain("Name", "na", UnicodeProperty.MISC, version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if (DEBUG && codepoint == 0x1D100) { - System.out.println("here"); - } - //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; - return ucd.getBlock(codepoint); - } - protected UnicodeMap _getUnicodeMap() { - return ucd.blockData; - } - }.setValues(ucd.getBlockNames(null)) - .setMain("Block", "blk", UnicodeProperty.CATALOG, 
version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; - return ucd.getBidiMirror(codepoint); - } - }.setValues("") - .setMain("Bidi_Mirroring_Glyph", "bmg", UnicodeProperty.STRING, version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; - return ucd.getCase(codepoint,UCD_Types.FULL,UCD_Types.FOLD); - } - }.setValues("") - .setMain("Case_Folding", "cf", UnicodeProperty.STRING, version)); - - add(new UnicodeProperty.SimpleProperty() { - NumberFormat nf = NumberFormat.getInstance(); - { - nf.setGroupingUsed(false); - nf.setMaximumFractionDigits(8); - nf.setMinimumFractionDigits(1); - } - public String _getValue(int codepoint) { - - double num = ucd.getNumericValue(codepoint); - if (Double.isNaN(num)) return null; - return nf.format(num); - } - }.setMain("Numeric_Value", "nv", UnicodeProperty.NUMERIC, version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int cp) { - if (!ucd.isRepresented(cp)) return null; - String b = nfkc.normalize(ucd.getCase(cp, UCD_Types.FULL, UCD_Types.FOLD)); - String c = nfkc.normalize(ucd.getCase(b, UCD_Types.FULL, UCD_Types.FOLD)); - if (c.equals(b)) return null; - return c; - } - public int getMaxWidth(boolean isShort) { - return 14; - } - }.setMain("FC_NFKC_Closure", "FC_NFKC", UnicodeProperty.STRING, version) - //.addName("FNC") - ); + private static UCD[] ucdCache = new UCD[UCD_Types.LIMIT_AGE]; - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if (!nfd.isNormalized(codepoint)) return "No"; - else if (nfd.isTrailing(codepoint)) throw new IllegalArgumentException("Internal Error!"); - else return "Yes"; - } - public int getMaxWidth(boolean isShort) { - return 15; - } - }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases() - 
.setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if (!nfc.isNormalized(codepoint)) return "No"; - else if (nfc.isTrailing(codepoint)) return "Maybe"; - else return "Yes"; - } - public int getMaxWidth(boolean isShort) { - return 15; - } - }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases() - .setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if (!nfkd.isNormalized(codepoint)) return "No"; - else if (nfkd.isTrailing(codepoint)) throw new IllegalArgumentException("Internal Error!"); - else return "Yes"; - } - public int getMaxWidth(boolean isShort) { - return 15; - } - }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases() - .setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version)); - - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if (!nfkc.isNormalized(codepoint)) return "No"; - else if (nfkc.isTrailing(codepoint)) return "Maybe"; - else return "Yes"; - } - public int getMaxWidth(boolean isShort) { - return 15; - } - }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases() - .setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version)); + private static HashMap factoryCache = new HashMap(); + public static synchronized ToolUnicodePropertySource make(String version) { + ToolUnicodePropertySource result = (ToolUnicodePropertySource) factoryCache.get(version); + if (result != null) + return result; + result = new ToolUnicodePropertySource(version); + factoryCache.put(version, result); + return result; + } + private ToolUnicodePropertySource(String version) { + ucd = UCD.make(version); + nfc = new Normalizer(Normalizer.NFC, ucd.getVersion()); + nfd = new Normalizer(Normalizer.NFD, ucd.getVersion()); + nfkc = new 
Normalizer(Normalizer.NFKC, ucd.getVersion()); + nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion()); - - /* - add(new UnicodeProperty.SimpleProperty() { - public String _getValue(int codepoint) { - if (!nfx.isNormalized(codepoint)) return NO; - else if (nfx.isTrailing(codepoint)) return MAYBE; - else return ""; - } - }.setMain("NFD_QuickCheck", "nv", UnicodeProperty.NUMERIC, version) - .setValues("")); - */ - - // Now the derived properties - if (DEBUG) System.out.println("Derived Properties"); - for (int i = 0; i < DerivedProperty.DERIVED_PROPERTY_LIMIT; ++i) { - UCDProperty prop = DerivedProperty.make(i); - if (prop == null) continue; - if (!prop.isStandard()) continue; - String name = prop.getName(); - if (getProperty(name) != null) { - if (DEBUG) System.out.println("Iterated Names: " + name + ", ALREADY PRESENT*"); - continue; // skip if already there - } - int type = prop.getValueType(); - if (i == UCD_Types.FC_NFKC_Closure) type = UnicodeProperty.STRING; - else if (i == UCD_Types.FullCompExclusion) type = UnicodeProperty.BINARY; - else type = remapUCDType(type); - - if (DEBUG) System.out.println(prop.getName()); - add(new UCDPropertyWrapper(prop,type,false)); + version = ucd.getVersion(); // regularize + + // first the special cases + if (DEBUG) + System.out.println("Adding Simple Cases"); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) + return null; + return ucd.getName(codepoint); + } + }.setValues("").setMain("Name", "na", UnicodeProperty.MISC, version)); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if (DEBUG && codepoint == 0x1D100) { + System.out.println("here"); } + //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; + return ucd.getBlock(codepoint); + } - // then the general stuff - - if (DEBUG) System.out.println("Other Properties"); - List names = new ArrayList(); - 
UnifiedProperty.getAvailablePropertiesAliases(names,ucd); - Iterator it = names.iterator(); - while (it.hasNext()) { - String name = (String) it.next(); - if (getProperty(name) != null) { - if (DEBUG) System.out.println("Iterated Names: " + name + ", ALREADY PRESENT"); - continue; // skip if already there - } - if (DEBUG) System.out.println("Iterated Names: " + name); - add(new ToolUnicodeProperty(name)); - } + protected UnicodeMap _getUnicodeMap() { + return ucd.blockData; + } + }.setValues(ucd.getBlockNames(null)).setMain("Block", "blk", UnicodeProperty.CATALOG, version)); - add(new UnicodeProperty.UnicodeMapProperty() { - { - unicodeMap = new UnicodeMap(); - unicodeMap.setErrorOnReset(true); - unicodeMap.put(0xD, "CR"); - unicodeMap.put(0xA, "LF"); - UnicodeProperty cat = getProperty("General_Category"); - UnicodeSet temp = cat.getSet("Line_Separator") - .addAll(cat.getSet("Paragraph_Separator")) - .addAll(cat.getSet("Control")) - .addAll(cat.getSet("Format")) - .remove(0xD).remove(0xA).remove(0x200C).remove(0x200D); - unicodeMap.putAll(temp, "Control"); - UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); - unicodeMap.putAll(graphemeExtend,"Extend"); - UnicodeProperty hangul = getProperty("Hangul_Syllable_Type"); - unicodeMap.putAll(hangul.getSet("L"),"L"); - unicodeMap.putAll(hangul.getSet("V"),"V"); - unicodeMap.putAll(hangul.getSet("T"),"T"); - unicodeMap.putAll(hangul.getSet("LV"),"LV"); - unicodeMap.putAll(hangul.getSet("LVT"),"LVT"); - unicodeMap.setMissing("Other"); - } - }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version) - .addValueAliases(new String[][] { - {"Control", "CN"}, - {"Extend", "EX"}, - {"Other", "XX"}, - }, true).swapFirst2ValueAliases()); + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; + return ucd.getBidiMirror(codepoint); + } + }.setValues("").setMain("Bidi_Mirroring_Glyph", 
"bmg", UnicodeProperty.STRING, version)); - add(new UnicodeProperty.UnicodeMapProperty() { - { - unicodeMap = new UnicodeMap(); - unicodeMap.setErrorOnReset(true); - UnicodeProperty cat = getProperty("General_Category"); - unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format"); - UnicodeProperty script = getProperty("Script"); - unicodeMap.putAll(script.getSet("Katakana") - .addAll(new UnicodeSet("[\u3031\u3032\u3033\u3034\u3035\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]")) - , "Katakana"); - Object foo = unicodeMap.getSet("Katakana"); - UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); - UnicodeProperty lineBreak = getProperty("Line_Break"); - unicodeMap.putAll(getProperty("Alphabetic").getSet("true") - .add(0x05F3) - .removeAll(getProperty("Ideographic").getSet("true")) - .removeAll(unicodeMap.getSet("Katakana")) - //.removeAll(script.getSet("Thai")) - //.removeAll(script.getSet("Lao")) - .removeAll(lineBreak.getSet("SA")) - .removeAll(script.getSet("Hiragana")) - .removeAll(graphemeExtend), - "ALetter"); - unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]") - ,"MidLetter"); - unicodeMap.putAll(lineBreak.getSet("Infix_Numeric") - .remove(0x003A), "MidNum"); - unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric"); - unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet"); - unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it. 
- unicodeMap.setMissing("Other"); - } - }.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version) - .addValueAliases(new String[][] { - {"Format", "FO"}, - {"Katakana", "KA"}, - {"ALetter", "LE"}, - {"MidLetter", "ML"}, - {"MidNum", "MN"}, - {"Numeric", "NU"}, - {"ExtendNumLet", "EX"}, - {"Other", "XX"}, - }, true).swapFirst2ValueAliases()); + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + //if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; + return ucd.getCase(codepoint, UCD_Types.FULL, UCD_Types.FOLD); + } + }.setValues("").setMain("Case_Folding", "cf", UnicodeProperty.STRING, version)); - add(new UnicodeProperty.UnicodeMapProperty() { - { - unicodeMap = new UnicodeMap(); - unicodeMap.setErrorOnReset(true); - unicodeMap.putAll(new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"), "Sep"); - UnicodeProperty cat = getProperty("General_Category"); - unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format"); - unicodeMap.putAll(getProperty("Whitespace").getSet("true") - .removeAll(unicodeMap.getSet("Sep")) - .remove(0xA0), "Sp"); - UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); - unicodeMap.putAll(getProperty("Lowercase").getSet("true") - .removeAll(graphemeExtend), "Lower"); - unicodeMap.putAll(getProperty("Uppercase").getSet("true") - .addAll(cat.getSet("Titlecase_Letter")) - , "Upper"); - UnicodeSet temp = getProperty("Alphabetic").getSet("true") - .add(0xA0).add(0x5F3) - .removeAll(unicodeMap.getSet("Lower")) - .removeAll(unicodeMap.getSet("Upper")) - .removeAll(graphemeExtend); - unicodeMap.putAll(temp, "OLetter"); - UnicodeProperty lineBreak = getProperty("Line_Break"); - unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric"); - unicodeMap.put(0x002E, "ATerm"); - unicodeMap.putAll(getProperty("STerm").getSet("true") - .removeAll(unicodeMap.getSet("ATerm")), "STerm"); - unicodeMap.putAll(cat.getSet("Open_Punctuation") - 
.addAll(cat.getSet("Close_Punctuation")) - .addAll(lineBreak.getSet("Quotation")) - .remove(0x05F3) - .removeAll(unicodeMap.getSet("ATerm")) - .removeAll(unicodeMap.getSet("STerm")) - , "Close"); - unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it. - unicodeMap.setMissing("Other"); - } - }.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version) - .addValueAliases(new String[][] { - {"Sep", "SE"}, - {"Format", "FO"}, - {"Sp", "SP"}, - {"Lower", "LO"}, - {"Upper", "UP"}, - {"OLetter", "LE"}, - {"Numeric", "NU"}, - {"ATerm", "AT"}, - {"STerm", "ST"}, - {"Close", "CL"}, - {"Other", "XX"}, - }, false).swapFirst2ValueAliases()); - } - - static String[] YES_NO_MAYBE = {"N", "M", "Y"}; - static String[] LONG_YES_NO_MAYBE = {"No", "Maybe", "Yes"}; + add(new UnicodeProperty.SimpleProperty() { + NumberFormat nf = NumberFormat.getInstance(); + { + nf.setGroupingUsed(false); + nf.setMaximumFractionDigits(8); + nf.setMinimumFractionDigits(1); + } - static String[] YES_NO = {"N", "Y"}; - static String[] LONG_YES_NO = {"No", "Yes"}; + public String _getValue(int codepoint) { + + double num = ucd.getNumericValue(codepoint); + if (Double.isNaN(num)) + return null; + return nf.format(num); + } + }.setMain("Numeric_Value", "nv", UnicodeProperty.NUMERIC, version)); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int cp) { + if (!ucd.isRepresented(cp)) + return null; + String b = nfkc.normalize(ucd.getCase(cp, UCD_Types.FULL, UCD_Types.FOLD)); + String c = nfkc.normalize(ucd.getCase(b, UCD_Types.FULL, UCD_Types.FOLD)); + if (c.equals(b)) + return null; + return c; + } + + public int getMaxWidth(boolean isShort) { + return 14; + } + }.setMain("FC_NFKC_Closure", "FC_NFKC", UnicodeProperty.STRING, version) + //.addName("FNC") + ); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if (!nfd.isNormalized(codepoint)) + return "No"; + else if (nfd.isTrailing(codepoint)) + throw 
new IllegalArgumentException("Internal Error!"); + else + return "Yes"; + } + + public int getMaxWidth(boolean isShort) { + return 15; + } + }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases().setMain("NFD_Quick_Check", "NFD_QC", UnicodeProperty.ENUMERATED, version)); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if (!nfc.isNormalized(codepoint)) + return "No"; + else if (nfc.isTrailing(codepoint)) + return "Maybe"; + else + return "Yes"; + } + + public int getMaxWidth(boolean isShort) { + return 15; + } + }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases().setMain("NFC_Quick_Check", "NFC_QC", UnicodeProperty.ENUMERATED, version)); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if (!nfkd.isNormalized(codepoint)) + return "No"; + else if (nfkd.isTrailing(codepoint)) + throw new IllegalArgumentException("Internal Error!"); + else + return "Yes"; + } + + public int getMaxWidth(boolean isShort) { + return 15; + } + }.setValues(LONG_YES_NO, YES_NO).swapFirst2ValueAliases().setMain("NFKD_Quick_Check", "NFKD_QC", UnicodeProperty.ENUMERATED, version)); + + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if (!nfkc.isNormalized(codepoint)) + return "No"; + else if (nfkc.isTrailing(codepoint)) + return "Maybe"; + else + return "Yes"; + } + + public int getMaxWidth(boolean isShort) { + return 15; + } + }.setValues(LONG_YES_NO_MAYBE, YES_NO_MAYBE).swapFirst2ValueAliases().setMain("NFKC_Quick_Check", "NFKC_QC", UnicodeProperty.ENUMERATED, version)); /* - "Bidi_Mirroring_Glyph", "Block", "Case_Folding", "Case_Sensitive", "ISO_Comment", - "Lowercase_Mapping", "Name", "Numeric_Value", "Simple_Case_Folding", - "Simple_Lowercase_Mapping", "Simple_Titlecase_Mapping", "Simple_Uppercase_Mapping", - "Titlecase_Mapping", "Unicode_1_Name", "Uppercase_Mapping", "isCased", "isCasefolded", - "isLowercase", "isNFC", "isNFD", "isNFKC", 
"isNFKD", "isTitlecase", "isUppercase", - "toNFC", "toNFD", "toNFKC", "toNKFD" + add(new UnicodeProperty.SimpleProperty() { + public String _getValue(int codepoint) { + if (!nfx.isNormalized(codepoint)) return NO; + else if (nfx.isTrailing(codepoint)) return MAYBE; + else return ""; + } + }.setMain("NFD_QuickCheck", "nv", UnicodeProperty.NUMERIC, version) + .setValues("")); + */ + + // Now the derived properties + if (DEBUG) + System.out.println("Derived Properties"); + for (int i = 0; i < DerivedProperty.DERIVED_PROPERTY_LIMIT; ++i) { + UCDProperty prop = DerivedProperty.make(i); + if (prop == null) + continue; + if (!prop.isStandard()) + continue; + String name = prop.getName(); + if (getProperty(name) != null) { + if (DEBUG) + System.out.println("Iterated Names: " + name + ", ALREADY PRESENT*"); + continue; // skip if already there + } + int type = prop.getValueType(); + if (i == UCD_Types.FC_NFKC_Closure) + type = UnicodeProperty.STRING; + else if (i == UCD_Types.FullCompExclusion) + type = UnicodeProperty.BINARY; + else + type = remapUCDType(type); + + if (DEBUG) + System.out.println(prop.getName()); + add(new UCDPropertyWrapper(prop, type, false)); + } + + // then the general stuff + + if (DEBUG) + System.out.println("Other Properties"); + List names = new ArrayList(); + UnifiedProperty.getAvailablePropertiesAliases(names, ucd); + Iterator it = names.iterator(); + while (it.hasNext()) { + String name = (String) it.next(); + if (getProperty(name) != null) { + if (DEBUG) + System.out.println("Iterated Names: " + name + ", ALREADY PRESENT"); + continue; // skip if already there + } + if (DEBUG) + System.out.println("Iterated Names: " + name); + add(new ToolUnicodeProperty(name)); + } + + int compositeVersion = ucd.getCompositeVersion(); + if (compositeVersion >= 0x040000) add(new UnicodeProperty.UnicodeMapProperty() { + { + unicodeMap = new UnicodeMap(); + unicodeMap.setErrorOnReset(true); + unicodeMap.put(0xD, "CR"); + unicodeMap.put(0xA, "LF"); + 
UnicodeProperty cat = getProperty("General_Category"); + UnicodeSet temp = cat.getSet("Line_Separator").addAll(cat.getSet("Paragraph_Separator")).addAll(cat.getSet("Control")).addAll(cat.getSet("Format")).remove(0xD).remove(0xA).remove(0x200C) + .remove(0x200D); + unicodeMap.putAll(temp, "Control"); + UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); + unicodeMap.putAll(graphemeExtend, "Extend"); + UnicodeProperty hangul = getProperty("Hangul_Syllable_Type"); + unicodeMap.putAll(hangul.getSet("L"), "L"); + unicodeMap.putAll(hangul.getSet("V"), "V"); + unicodeMap.putAll(hangul.getSet("T"), "T"); + unicodeMap.putAll(hangul.getSet("LV"), "LV"); + unicodeMap.putAll(hangul.getSet("LVT"), "LVT"); + unicodeMap.setMissing("Other"); + } + }.setMain("Grapheme_Cluster_Break", "GCB", UnicodeProperty.ENUMERATED, version).addValueAliases(new String[][] { { "Control", "CN" }, { "Extend", "EX" }, { "Other", "XX" }, }, true) + .swapFirst2ValueAliases()); + + if (compositeVersion >= 0x040000) add(new UnicodeProperty.UnicodeMapProperty() { + { + unicodeMap = new UnicodeMap(); + unicodeMap.setErrorOnReset(true); + UnicodeProperty cat = getProperty("General_Category"); + unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format"); + UnicodeProperty script = getProperty("Script"); + unicodeMap.putAll(script.getSet("Katakana").addAll(new UnicodeSet("[\u3031\u3032\u3033\u3034\u3035\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]")), "Katakana"); + Object foo = unicodeMap.getSet("Katakana"); + UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); + UnicodeProperty lineBreak = getProperty("Line_Break"); + unicodeMap.putAll(getProperty("Alphabetic").getSet("true").add(0x05F3).removeAll(getProperty("Ideographic").getSet("true")).removeAll(unicodeMap.getSet("Katakana")) + //.removeAll(script.getSet("Thai")) + //.removeAll(script.getSet("Lao")) + 
.removeAll(lineBreak.getSet("SA")).removeAll(script.getSet("Hiragana")).removeAll(graphemeExtend), "ALetter"); + unicodeMap.putAll(new UnicodeSet("[\\u0027\\u00B7\\u05F4\\u2019\\u2027\\u003A]"), "MidLetter"); + unicodeMap.putAll(lineBreak.getSet("Infix_Numeric").remove(0x003A), "MidNum"); + unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric"); + unicodeMap.putAll(cat.getSet("Connector_Punctuation").remove(0x30FB).remove(0xFF65), "ExtendNumLet"); + unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it. + unicodeMap.setMissing("Other"); + } + }.setMain("Word_Break", "WB", UnicodeProperty.ENUMERATED, version).addValueAliases( + new String[][] { { "Format", "FO" }, { "Katakana", "KA" }, { "ALetter", "LE" }, { "MidLetter", "ML" }, { "MidNum", "MN" }, { "Numeric", "NU" }, { "ExtendNumLet", "EX" }, { "Other", "XX" }, }, + true).swapFirst2ValueAliases()); + + if (compositeVersion >= 0x040000) add(new UnicodeProperty.UnicodeMapProperty() { + { + unicodeMap = new UnicodeMap(); + unicodeMap.setErrorOnReset(true); + unicodeMap.putAll(new UnicodeSet("[\\u000A\\u000D\\u0085\\u2028\\u2029]"), "Sep"); + UnicodeProperty cat = getProperty("General_Category"); + unicodeMap.putAll(cat.getSet("Format").remove(0x200C).remove(0x200D), "Format"); + unicodeMap.putAll(getProperty("Whitespace").getSet("true").removeAll(unicodeMap.getSet("Sep")).remove(0xA0), "Sp"); + UnicodeSet graphemeExtend = getProperty("Grapheme_Extend").getSet("true"); + unicodeMap.putAll(getProperty("Lowercase").getSet("true").removeAll(graphemeExtend), "Lower"); + unicodeMap.putAll(getProperty("Uppercase").getSet("true").addAll(cat.getSet("Titlecase_Letter")), "Upper"); + UnicodeSet temp = getProperty("Alphabetic").getSet("true").add(0xA0).add(0x5F3).removeAll(unicodeMap.getSet("Lower")).removeAll(unicodeMap.getSet("Upper")).removeAll(graphemeExtend); + unicodeMap.putAll(temp, "OLetter"); + UnicodeProperty lineBreak = getProperty("Line_Break"); + 
unicodeMap.putAll(lineBreak.getSet("Numeric"), "Numeric"); + unicodeMap.put(0x002E, "ATerm"); + unicodeMap.putAll(getProperty("STerm").getSet("true").removeAll(unicodeMap.getSet("ATerm")), "STerm"); + unicodeMap.putAll(cat.getSet("Open_Punctuation").addAll(cat.getSet("Close_Punctuation")).addAll(lineBreak.getSet("Quotation")).remove(0x05F3).removeAll(unicodeMap.getSet("ATerm")).removeAll( + unicodeMap.getSet("STerm")), "Close"); + unicodeMap.putAll(graphemeExtend, "Other"); // to verify that none of the above touch it. + unicodeMap.setMissing("Other"); + } + }.setMain("Sentence_Break", "SB", UnicodeProperty.ENUMERATED, version).addValueAliases( + new String[][] { { "Sep", "SE" }, { "Format", "FO" }, { "Sp", "SP" }, { "Lower", "LO" }, { "Upper", "UP" }, { "OLetter", "LE" }, { "Numeric", "NU" }, { "ATerm", "AT" }, { "STerm", "ST" }, + { "Close", "CL" }, { "Other", "XX" }, }, false).swapFirst2ValueAliases()); + } + + static String[] YES_NO_MAYBE = { "N", "M", "Y" }; + + static String[] LONG_YES_NO_MAYBE = { "No", "Maybe", "Yes" }; + + static String[] YES_NO = { "N", "Y" }; + + static String[] LONG_YES_NO = { "No", "Yes" }; + + /* + "Bidi_Mirroring_Glyph", "Block", "Case_Folding", "Case_Sensitive", "ISO_Comment", + "Lowercase_Mapping", "Name", "Numeric_Value", "Simple_Case_Folding", + "Simple_Lowercase_Mapping", "Simple_Titlecase_Mapping", "Simple_Uppercase_Mapping", + "Titlecase_Mapping", "Unicode_1_Name", "Uppercase_Mapping", "isCased", "isCasefolded", + "isLowercase", "isNFC", "isNFD", "isNFKC", "isNFKD", "isTitlecase", "isUppercase", + "toNFC", "toNFD", "toNFKC", "toNKFD" }); */ - - /* - private class NameProperty extends UnicodeProperty.SimpleProperty { - {set("Name", "na", "", UnicodeProperty.STRING);} - public String getPropertyValue(int codepoint) { - if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; - return ucd.getName(codepoint); - } + + /* + private class NameProperty extends UnicodeProperty.SimpleProperty { + {set("Name", "na", "", 
UnicodeProperty.STRING);} + public String getPropertyValue(int codepoint) { + if ((ODD_BALLS & ucd.getCategoryMask(codepoint)) != 0) return null; + return ucd.getName(codepoint); + } + } + */ + + static class UCDPropertyWrapper extends UnicodeProperty { + UCDProperty ucdProperty; + + boolean yes_no_maybe; + + UCDPropertyWrapper(UCDProperty ucdProperty, int type, boolean yes_no_maybe) { + this.ucdProperty = ucdProperty; + setType(type); + String name = ucdProperty.getName(UCDProperty.LONG); + if (name == null) + ucdProperty.getName(UCDProperty.SHORT); + setName(name); + this.yes_no_maybe = yes_no_maybe; } - */ - - static class UCDPropertyWrapper extends UnicodeProperty { - UCDProperty ucdProperty; - boolean yes_no_maybe; - - UCDPropertyWrapper(UCDProperty ucdProperty, int type, boolean yes_no_maybe) { - this.ucdProperty = ucdProperty; - setType(type); - String name = ucdProperty.getName(UCDProperty.LONG); - if (name == null) ucdProperty.getName(UCDProperty.SHORT); - setName(name); - this.yes_no_maybe = yes_no_maybe; - } - protected String _getVersion() { - return ucdProperty.getUCD().getVersion(); - } - protected String _getValue(int codepoint) { - return ucdProperty.getValue(codepoint, UCDProperty.LONG); - } - protected List _getNameAliases(List result) { - addUnique(ucdProperty.getName(UCDProperty.SHORT), result); - String name = getName(); - addUnique(name, result); - if (name.equals("White_Space")) addUnique("space", result); - return result; - } - protected List _getValueAliases(String valueAlias, List result) { - if (isType(BINARY_MASK)) { - if (valueAlias.equals("True")) addUnique("T", result); - else if (valueAlias.equals("False")) addUnique("F", result); - addUnique(valueAlias, result); - } - if (yes_no_maybe) { - if (valueAlias.equals("Yes")) addUnique("Y", result); - else if (valueAlias.equals("No")) addUnique("N", result); - else if (valueAlias.equals("Maybe")) addUnique("M", result); - addUnique(valueAlias, result); - } - return result; - } - protected 
List _getAvailableValues(List result) { - if (isType(BINARY_MASK)) { - addUnique("True", result); - addUnique("False", result); - } - if (yes_no_maybe) { - addUnique("No",result); - addUnique("Maybe",result); - addUnique("Yes",result); - } - return result; - } + + protected String _getVersion() { + return ucdProperty.getUCD().getVersion(); } - static final int ODD_BALLS = (1<"); - else if (type == NUMERIC) result.add(""); - else if (type == BINARY) { - result.add("True"); - result.add("False"); - } else if (type == ENUMERATED || type == CATALOG) { - byte style = UCD_Types.LONG; - int prop = propMask>>8; - String temp = null; - boolean titlecase = false; - for (int i = 0; i < 256; ++i) { - boolean check = false; - try { - switch (prop) { - case UCD_Types.CATEGORY>>8: temp = (ucd.getCategoryID_fromIndex((byte)i, style)); break; - case UCD_Types.COMBINING_CLASS>>8: temp = (ucd.getCombiningClassID_fromIndex((short)i, style)); break; - case UCD_Types.BIDI_CLASS>>8: temp = (ucd.getBidiClassID_fromIndex((byte)i, style)); break; - case UCD_Types.DECOMPOSITION_TYPE>>8: temp = (ucd.getDecompositionTypeID_fromIndex((byte)i, style)); - //check = temp != null; - break; - case UCD_Types.NUMERIC_TYPE>>8: temp = (ucd.getNumericTypeID_fromIndex((byte)i, style)); - titlecase = true; - break; - case UCD_Types.EAST_ASIAN_WIDTH>>8: temp = (ucd.getEastAsianWidthID_fromIndex((byte)i, style)); break; - case UCD_Types.LINE_BREAK>>8: temp = (ucd.getLineBreakID_fromIndex((byte)i, style)); break; - case UCD_Types.JOINING_TYPE>>8: temp = (ucd.getJoiningTypeID_fromIndex((byte)i, style)); break; - case UCD_Types.JOINING_GROUP>>8: temp = (ucd.getJoiningGroupID_fromIndex((byte)i, style)); break; - case UCD_Types.SCRIPT>>8: - temp = (ucd.getScriptID_fromIndex((byte)i, style)); titlecase = true; - if (UnicodeProperty.UNUSED.equals(temp)) continue; - if (temp != null) temp = UCharacter.toTitleCase(Locale.ENGLISH,temp,null); - break; - case UCD_Types.AGE>>8: temp = (ucd.getAgeID_fromIndex((byte)i, 
style)); break; - case UCD_Types.HANGUL_SYLLABLE_TYPE>>8: - temp = (ucd.getHangulSyllableTypeID_fromIndex((byte)i,style)); break; - default: throw new IllegalArgumentException("Internal Error: " + prop); - } - } catch (ArrayIndexOutOfBoundsException e) { - continue; - } - if (check) System.out.println("Value: " + temp); - if (temp != null && temp.length() != 0 && !temp.equals(UNUSED)) { - result.add(Utility.getUnskeleton(temp, titlecase)); - } - if (check) System.out.println("Value2: " + temp); - } - //if (prop == (UCD_Types.DECOMPOSITION_TYPE>>8)) result.add("none"); - //if (prop == (UCD_Types.JOINING_TYPE>>8)) result.add("Non_Joining"); - //if (prop == (UCD_Types.NUMERIC_TYPE>>8)) result.add("None"); - } - return result; - } + protected String _getValue(int codepoint) { + String result = ucdProperty.getValue(codepoint, UCDProperty.LONG); + if (result.length() == 0) { + return "False"; + } + return result; + } - public List _getNameAliases(List result) { - if (result == null) result = new ArrayList(); - addUnique(Utility.getUnskeleton(up.getName(UCD_Types.SHORT), false), result); - String longName = up.getName(UCD_Types.LONG); - addUnique(Utility.getUnskeleton(longName, true), result); - // hack - if (longName.equals("White_Space")) addUnique("space", result); - return result; - } - - public List _getValueAliases(String valueAlias, List result) { - if (result == null) result = new ArrayList(); - int type = getType() & CORE_MASK; - if (type == STRING || type == MISC || type == NUMERIC) { - UnicodeProperty.addUnique(valueAlias, result); - return result; - } else if (type == BINARY) { - UnicodeProperty.addUnique(valueAlias, result); - return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result); - } else if (type == ENUMERATED || type == CATALOG) { - byte style = UCD_Types.LONG; - int prop = propMask>>8; - boolean titlecase = false; - for (int i = 0; i < 256; ++i) { - try { - switch (prop) { - case UCD_Types.CATEGORY>>8: - return 
lookup(valueAlias, UCD_Names.LONG_GENERAL_CATEGORY, UCD_Names.GENERAL_CATEGORY, UCD_Names.EXTRA_GENERAL_CATEGORY, result); - case UCD_Types.COMBINING_CLASS>>8: - addUnique(String.valueOf(0xFF&Utility.lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, true)), result); - return lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, UCD_Names.COMBINING_CLASS, null, result); - case UCD_Types.BIDI_CLASS>>8: - return lookup(valueAlias, UCD_Names.LONG_BIDI_CLASS, UCD_Names.BIDI_CLASS, null, result); - case UCD_Types.DECOMPOSITION_TYPE>>8: - return lookup(valueAlias, UCD_Names.LONG_DECOMPOSITION_TYPE, UCD_Names.DECOMPOSITION_TYPE, null, result); - case UCD_Types.NUMERIC_TYPE>>8: - return lookup(valueAlias, UCD_Names.LONG_NUMERIC_TYPE, UCD_Names.NUMERIC_TYPE, null, result); - case UCD_Types.EAST_ASIAN_WIDTH>>8: - return lookup(valueAlias, UCD_Names.LONG_EAST_ASIAN_WIDTH, UCD_Names.EAST_ASIAN_WIDTH, null, result); - case UCD_Types.LINE_BREAK>>8: - lookup(valueAlias, UCD_Names.LONG_LINE_BREAK, UCD_Names.LINE_BREAK, null, result); - if (valueAlias.equals("Inseparable")) addUnique("Inseperable", result); - // Inseparable; Inseperable - return result; - case UCD_Types.JOINING_TYPE>>8: - return lookup(valueAlias, UCD_Names.LONG_JOINING_TYPE, UCD_Names.JOINING_TYPE, null, result); - case UCD_Types.JOINING_GROUP>>8: - return lookup(valueAlias, UCD_Names.JOINING_GROUP, null, null, result); - case UCD_Types.SCRIPT>>8: - return lookup(valueAlias, UCD_Names.LONG_SCRIPT, UCD_Names.SCRIPT, UCD_Names.EXTRA_SCRIPT, result); - case UCD_Types.AGE>>8: - return lookup(valueAlias, UCD_Names.AGE, null, null, result); - case UCD_Types.HANGUL_SYLLABLE_TYPE>>8: - return lookup(valueAlias, UCD_Names.LONG_HANGUL_SYLLABLE_TYPE, UCD_Names.HANGUL_SYLLABLE_TYPE, null, result); - default: throw new IllegalArgumentException("Internal Error: " + prop); - } - } catch (ArrayIndexOutOfBoundsException e) { - continue; - } - } - } - throw new ArrayIndexOutOfBoundsException("not supported yet"); - } + protected List 
_getNameAliases(List result) { + addUnique(ucdProperty.getName(UCDProperty.SHORT), result); + String name = getName(); + addUnique(name, result); + if (name.equals("White_Space")) + addUnique("space", result); + return result; + } - public String _getValue(int codepoint) { - byte style = UCD_Types.LONG; - String temp = null; - boolean titlecase = false; - switch (propMask>>8) { - case UCD_Types.CATEGORY>>8: temp = (ucd.getCategoryID_fromIndex(ucd.getCategory(codepoint), style)); break; - case UCD_Types.COMBINING_CLASS>>8: temp = (ucd.getCombiningClassID_fromIndex(ucd.getCombiningClass(codepoint), style)); - //if (temp.startsWith("Fixed_")) temp = temp.substring(6); + protected List _getValueAliases(String valueAlias, List result) { + if (isType(BINARY_MASK)) { + if (valueAlias.equals("True")) + addUnique("T", result); + else if (valueAlias.equals("False")) + addUnique("F", result); + addUnique(valueAlias, result); + } + if (yes_no_maybe) { + if (valueAlias.equals("Yes")) + addUnique("Y", result); + else if (valueAlias.equals("No")) + addUnique("N", result); + else if (valueAlias.equals("Maybe")) + addUnique("M", result); + addUnique(valueAlias, result); + } + return result; + } + + protected List _getAvailableValues(List result) { + if (isType(BINARY_MASK)) { + addUnique("True", result); + addUnique("False", result); + } + if (yes_no_maybe) { + addUnique("No", result); + addUnique("Maybe", result); + addUnique("Yes", result); + } + return result; + } + } + + static final int ODD_BALLS = (1 << UCD_Types.Cn) | (1 << UCD_Types.Co) | (1 << UCD_Types.Cs) | (1 << UCD.Cc); + + /* (non-Javadoc) + * @see com.ibm.icu.dev.test.util.UnicodePropertySource#getPropertyAliases(java.util.Collection) + */ + private class ToolUnicodeProperty extends UnicodeProperty { + com.ibm.text.UCD.UCDProperty up; + + int propMask; + + static final int EXTRA_START = 0x10000; + + private ToolUnicodeProperty(String propertyAlias) { + propMask = UnifiedProperty.getPropmask(propertyAlias, ucd); + up 
= UnifiedProperty.make(propMask, ucd); + if (up == null) + throw new IllegalArgumentException("Not found: " + propertyAlias); + if (propertyAlias.equals("Case_Fold_Turkish_I")) { + System.out.println(propertyAlias + " " + getTypeName(getType())); + } + setType(getPropertyTypeInternal()); + setName(propertyAlias); + } + + public List _getAvailableValues(List result) { + if (result == null) + result = new ArrayList(); + int type = getType() & CORE_MASK; + if (type == STRING || type == MISC) + result.add(""); + else if (type == NUMERIC) + result.add(""); + else if (type == BINARY) { + result.add("True"); + result.add("False"); + } else if (type == ENUMERATED || type == CATALOG) { + byte style = UCD_Types.LONG; + int prop = propMask >> 8; + String temp = null; + boolean titlecase = false; + for (int i = 0; i < 256; ++i) { + boolean check = false; + try { + switch (prop) { + case UCD_Types.CATEGORY >> 8: + temp = (ucd.getCategoryID_fromIndex((byte) i, style)); break; - case UCD_Types.BIDI_CLASS>>8: temp = (ucd.getBidiClassID_fromIndex(ucd.getBidiClass(codepoint), style)); break; - case UCD_Types.DECOMPOSITION_TYPE>>8: temp = (ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(codepoint), style)); - if (temp == null || temp.length() == 0) temp = "none"; + case UCD_Types.COMBINING_CLASS >> 8: + temp = (ucd.getCombiningClassID_fromIndex((short) i, style)); break; - case UCD_Types.NUMERIC_TYPE>>8: temp = (ucd.getNumericTypeID_fromIndex(ucd.getNumericType(codepoint), style)); - titlecase = true; - if (temp == null || temp.length() == 0) temp = "None"; + case UCD_Types.BIDI_CLASS >> 8: + temp = (ucd.getBidiClassID_fromIndex((byte) i, style)); break; - case UCD_Types.EAST_ASIAN_WIDTH>>8: temp = (ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(codepoint), style)); break; - case UCD_Types.LINE_BREAK>>8: temp = (ucd.getLineBreakID_fromIndex(ucd.getLineBreak(codepoint), style)); break; - case UCD_Types.JOINING_TYPE>>8: temp = 
(ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(codepoint), style)); - if (temp == null || temp.length() == 0) temp = "Non_Joining"; + case UCD_Types.DECOMPOSITION_TYPE >> 8: + temp = (ucd.getDecompositionTypeID_fromIndex((byte) i, style)); + //check = temp != null; break; - case UCD_Types.JOINING_GROUP>>8: temp = (ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(codepoint), style)); break; - case UCD_Types.SCRIPT>>8: temp = (ucd.getScriptID_fromIndex(ucd.getScript(codepoint), style)); - if (temp != null) temp = UCharacter.toTitleCase(Locale.ENGLISH,temp,null); + case UCD_Types.NUMERIC_TYPE >> 8: + temp = (ucd.getNumericTypeID_fromIndex((byte) i, style)); titlecase = true; break; - case UCD_Types.AGE>>8: temp = getAge(codepoint); break; - case UCD_Types.HANGUL_SYLLABLE_TYPE>>8: - temp = (ucd.getHangulSyllableTypeID_fromIndex(ucd.getHangulSyllableType(codepoint),style)); break; + case UCD_Types.EAST_ASIAN_WIDTH >> 8: + temp = (ucd.getEastAsianWidthID_fromIndex((byte) i, style)); + break; + case UCD_Types.LINE_BREAK >> 8: + temp = (ucd.getLineBreakID_fromIndex((byte) i, style)); + break; + case UCD_Types.JOINING_TYPE >> 8: + temp = (ucd.getJoiningTypeID_fromIndex((byte) i, style)); + break; + case UCD_Types.JOINING_GROUP >> 8: + temp = (ucd.getJoiningGroupID_fromIndex((byte) i, style)); + break; + case UCD_Types.SCRIPT >> 8: + temp = (ucd.getScriptID_fromIndex((byte) i, style)); + titlecase = true; + if (UnicodeProperty.UNUSED.equals(temp)) + continue; + if (temp != null) + temp = UCharacter.toTitleCase(Locale.ENGLISH, temp, null); + break; + case UCD_Types.AGE >> 8: + temp = (ucd.getAgeID_fromIndex((byte) i, style)); + break; + case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8: + temp = (ucd.getHangulSyllableTypeID_fromIndex((byte) i, style)); + break; + default: + throw new IllegalArgumentException("Internal Error: " + prop); } - if (temp != null) return Utility.getUnskeleton(temp,titlecase); - if (isType(BINARY_MASK)) { - return up.hasValue(codepoint) ? 
"True" : "False"; - } - throw new IllegalArgumentException("Failed to find value for " + Utility.hex(codepoint)); + } catch (ArrayIndexOutOfBoundsException e) { + continue; + } + if (check) + System.out.println("Value: " + temp); + if (temp != null && temp.length() != 0 && !temp.equals(UNUSED)) { + result.add(Utility.getUnskeleton(temp, titlecase)); + } + if (check) + System.out.println("Value2: " + temp); } - - public String getAge(int codePoint) { - if (codePoint == 0xF0000) { - System.out.println("debug point"); - } - if (needAgeCache) { - for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) { - ucdCache[i] = UCD.make(UCD_Names.AGE_VERSIONS[i]); - } - needAgeCache = false; - } - for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) { - if (ucdCache[i].isAllocated(codePoint)) return UCD_Names.AGE[i]; - } - return UCD_Names.AGE[UCD_Types.UNKNOWN]; - } - - /* (non-Javadoc) - * @see com.ibm.icu.dev.test.util.UnicodePropertySource#getPropertyType() - */ - private int getPropertyTypeInternal() { - - switch(propMask) { - case UCD_Types.BINARY_PROPERTIES | UCD_Types.CaseFoldTurkishI: - case UCD_Types.BINARY_PROPERTIES | UCD_Types.Non_break: - return EXTENDED_BINARY; - } - - switch(propMask>>8) { - case UCD_Types.SCRIPT>>8: - case UCD_Types.AGE>>8: - return CATALOG; - } - int mask = 0; - if (!up.isStandard()) mask = EXTENDED_MASK; - return remapUCDType(up.getValueType()) | mask; - } - - public String _getVersion() { - return up.ucd.getVersion(); - } - - } - - private int remapUCDType(int result) { - switch (result) { - case UCD_Types.NUMERIC_PROP: result = UnicodeProperty.NUMERIC; break; - case UCD_Types.STRING_PROP: result = UnicodeProperty.STRING; break; - case UCD_Types.MISC_PROP: result = UnicodeProperty.STRING; break; - case UCD_Types.CATALOG_PROP: result = UnicodeProperty.ENUMERATED; break; - case UCD_Types.FLATTENED_BINARY_PROP: - case UCD_Types.ENUMERATED_PROP: result = UnicodeProperty.ENUMERATED; break; - case UCD_Types.BINARY_PROP: result = 
UnicodeProperty.BINARY; break; - case UCD_Types.UNKNOWN_PROP: - default: - result = UnicodeProperty.STRING; - //throw new IllegalArgumentException("Type: UNKNOWN_PROP"); - } - return result; + //if (prop == (UCD_Types.DECOMPOSITION_TYPE>>8)) result.add("none"); + //if (prop == (UCD_Types.JOINING_TYPE>>8)) result.add("Non_Joining"); + //if (prop == (UCD_Types.NUMERIC_TYPE>>8)) result.add("None"); + } + return result; } - static List lookup(String valueAlias, String[] main, String[] aux, Map aux2, List result) { - //System.out.println(valueAlias + "=>"); - //System.out.println("=>" + aux[pos]); - if (aux != null) { - int pos = 0xFF & Utility.lookup(valueAlias, main, true); - UnicodeProperty.addUnique(aux[pos], result); - } + public List _getNameAliases(List result) { + if (result == null) + result = new ArrayList(); + addUnique(Utility.getUnskeleton(up.getName(UCD_Types.SHORT), false), result); + String longName = up.getName(UCD_Types.LONG); + addUnique(Utility.getUnskeleton(longName, true), result); + // hack + if (longName.equals("White_Space")) + addUnique("space", result); + return result; + } + + public List _getValueAliases(String valueAlias, List result) { + if (result == null) + result = new ArrayList(); + int type = getType() & CORE_MASK; + if (type == STRING || type == MISC || type == NUMERIC) { UnicodeProperty.addUnique(valueAlias, result); - if (aux2 != null) { - String xtra = (String) aux2.get(valueAlias); - if (xtra != null) UnicodeProperty.addUnique(xtra, result); - } return result; + } else if (type == BINARY) { + UnicodeProperty.addUnique(valueAlias, result); + return lookup(valueAlias, UCD_Names.YN_TABLE_LONG, UCD_Names.YN_TABLE, null, result); + } else if (type == ENUMERATED || type == CATALOG) { + byte style = UCD_Types.LONG; + int prop = propMask >> 8; + boolean titlecase = false; + for (int i = 0; i < 256; ++i) { + try { + switch (prop) { + case UCD_Types.CATEGORY >> 8: + return lookup(valueAlias, UCD_Names.LONG_GENERAL_CATEGORY, 
UCD_Names.GENERAL_CATEGORY, UCD_Names.EXTRA_GENERAL_CATEGORY, result); + case UCD_Types.COMBINING_CLASS >> 8: + addUnique(String.valueOf(0xFF & Utility.lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, true)), result); + return lookup(valueAlias, UCD_Names.LONG_COMBINING_CLASS, UCD_Names.COMBINING_CLASS, null, result); + case UCD_Types.BIDI_CLASS >> 8: + return lookup(valueAlias, UCD_Names.LONG_BIDI_CLASS, UCD_Names.BIDI_CLASS, null, result); + case UCD_Types.DECOMPOSITION_TYPE >> 8: + return lookup(valueAlias, UCD_Names.LONG_DECOMPOSITION_TYPE, UCD_Names.DECOMPOSITION_TYPE, null, result); + case UCD_Types.NUMERIC_TYPE >> 8: + return lookup(valueAlias, UCD_Names.LONG_NUMERIC_TYPE, UCD_Names.NUMERIC_TYPE, null, result); + case UCD_Types.EAST_ASIAN_WIDTH >> 8: + return lookup(valueAlias, UCD_Names.LONG_EAST_ASIAN_WIDTH, UCD_Names.EAST_ASIAN_WIDTH, null, result); + case UCD_Types.LINE_BREAK >> 8: + lookup(valueAlias, UCD_Names.LONG_LINE_BREAK, UCD_Names.LINE_BREAK, null, result); + if (valueAlias.equals("Inseparable")) + addUnique("Inseperable", result); + // Inseparable; Inseperable + return result; + case UCD_Types.JOINING_TYPE >> 8: + return lookup(valueAlias, UCD_Names.LONG_JOINING_TYPE, UCD_Names.JOINING_TYPE, null, result); + case UCD_Types.JOINING_GROUP >> 8: + return lookup(valueAlias, UCD_Names.JOINING_GROUP, null, null, result); + case UCD_Types.SCRIPT >> 8: + return lookup(valueAlias, UCD_Names.LONG_SCRIPT, UCD_Names.SCRIPT, UCD_Names.EXTRA_SCRIPT, result); + case UCD_Types.AGE >> 8: + return lookup(valueAlias, UCD_Names.AGE, null, null, result); + case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8: + return lookup(valueAlias, UCD_Names.LONG_HANGUL_SYLLABLE_TYPE, UCD_Names.HANGUL_SYLLABLE_TYPE, null, result); + default: + throw new IllegalArgumentException("Internal Error: " + prop); + } + } catch (ArrayIndexOutOfBoundsException e) { + continue; + } + } + } + throw new ArrayIndexOutOfBoundsException("not supported yet"); } - /* - static class DerivedPropertyWrapper 
extends UnicodeProperty { - UCDProperty derivedProperty; - UCD ucd; - - DerivedPropertyWrapper(int derivedPropertyID, UCD ucd) { - this.ucd = ucd; - derivedProperty = DerivedProperty.make(derivedPropertyID, ucd); - } - protected String _getVersion() { - return ucd.getVersion(); - } - - protected String _getValue(int codepoint) { - return derivedProperty.getValue(codepoint, UCD_Types.LONG); - } - protected List _getNameAliases(List result) { - if (result != null) result = new ArrayList(1); - addUnique(derivedProperty.getName(UCD_Types.SHORT), result); - addUnique(derivedProperty.getName(UCD_Types.LONG), result); - return null; - } - - protected List _getValueAliases(String valueAlias, List result) { - // TODO Auto-generated method stub - return null; - } - protected List _getAvailableValues(List result) { - // TODO Auto-generated method stub - return null; - } - + public String _getValue(int codepoint) { + byte style = UCD_Types.LONG; + String temp = null; + boolean titlecase = false; + switch (propMask >> 8) { + case UCD_Types.CATEGORY >> 8: + temp = (ucd.getCategoryID_fromIndex(ucd.getCategory(codepoint), style)); + break; + case UCD_Types.COMBINING_CLASS >> 8: + temp = (ucd.getCombiningClassID_fromIndex(ucd.getCombiningClass(codepoint), style)); + //if (temp.startsWith("Fixed_")) temp = temp.substring(6); + break; + case UCD_Types.BIDI_CLASS >> 8: + temp = (ucd.getBidiClassID_fromIndex(ucd.getBidiClass(codepoint), style)); + break; + case UCD_Types.DECOMPOSITION_TYPE >> 8: + temp = (ucd.getDecompositionTypeID_fromIndex(ucd.getDecompositionType(codepoint), style)); + if (temp == null || temp.length() == 0) + temp = "none"; + break; + case UCD_Types.NUMERIC_TYPE >> 8: + temp = (ucd.getNumericTypeID_fromIndex(ucd.getNumericType(codepoint), style)); + titlecase = true; + if (temp == null || temp.length() == 0) + temp = "None"; + break; + case UCD_Types.EAST_ASIAN_WIDTH >> 8: + temp = (ucd.getEastAsianWidthID_fromIndex(ucd.getEastAsianWidth(codepoint), style)); + 
break; + case UCD_Types.LINE_BREAK >> 8: + temp = (ucd.getLineBreakID_fromIndex(ucd.getLineBreak(codepoint), style)); + break; + case UCD_Types.JOINING_TYPE >> 8: + temp = (ucd.getJoiningTypeID_fromIndex(ucd.getJoiningType(codepoint), style)); + if (temp == null || temp.length() == 0) + temp = "Non_Joining"; + break; + case UCD_Types.JOINING_GROUP >> 8: + temp = (ucd.getJoiningGroupID_fromIndex(ucd.getJoiningGroup(codepoint), style)); + break; + case UCD_Types.SCRIPT >> 8: + temp = (ucd.getScriptID_fromIndex(ucd.getScript(codepoint), style)); + if (temp != null) + temp = UCharacter.toTitleCase(Locale.ENGLISH, temp, null); + titlecase = true; + break; + case UCD_Types.AGE >> 8: + temp = getAge(codepoint); + break; + case UCD_Types.HANGUL_SYLLABLE_TYPE >> 8: + temp = (ucd.getHangulSyllableTypeID_fromIndex(ucd.getHangulSyllableType(codepoint), style)); + break; + } + if (temp != null) + return Utility.getUnskeleton(temp, titlecase); + if (isType(BINARY_MASK)) { + return up.hasValue(codepoint) ? 
"True" : "False"; + } + throw new IllegalArgumentException("Failed to find value for " + Utility.hex(codepoint)); } - */ + + public String getAge(int codePoint) { + if (codePoint == 0xF0000) { + System.out.println("debug point"); + } + if (needAgeCache) { + for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) { + ucdCache[i] = UCD.make(UCD_Names.AGE_VERSIONS[i]); + } + needAgeCache = false; + } + for (int i = UCD_Types.AGE11; i < UCD_Types.LIMIT_AGE; ++i) { + if (ucdCache[i].isAllocated(codePoint)) + return UCD_Names.AGE[i]; + } + return UCD_Names.AGE[UCD_Types.UNKNOWN]; + } + + /* (non-Javadoc) + * @see com.ibm.icu.dev.test.util.UnicodePropertySource#getPropertyType() + */ + private int getPropertyTypeInternal() { + + switch (propMask) { + case UCD_Types.BINARY_PROPERTIES | UCD_Types.CaseFoldTurkishI: + case UCD_Types.BINARY_PROPERTIES | UCD_Types.Non_break: + return EXTENDED_BINARY; + } + + switch (propMask >> 8) { + case UCD_Types.SCRIPT >> 8: + case UCD_Types.AGE >> 8: + return CATALOG; + } + int mask = 0; + if (!up.isStandard()) + mask = EXTENDED_MASK; + return remapUCDType(up.getValueType()) | mask; + } + + public String _getVersion() { + return up.ucd.getVersion(); + } + + } + + private int remapUCDType(int result) { + switch (result) { + case UCD_Types.NUMERIC_PROP: + result = UnicodeProperty.NUMERIC; + break; + case UCD_Types.STRING_PROP: + result = UnicodeProperty.STRING; + break; + case UCD_Types.MISC_PROP: + result = UnicodeProperty.STRING; + break; + case UCD_Types.CATALOG_PROP: + result = UnicodeProperty.ENUMERATED; + break; + case UCD_Types.FLATTENED_BINARY_PROP: + case UCD_Types.ENUMERATED_PROP: + result = UnicodeProperty.ENUMERATED; + break; + case UCD_Types.BINARY_PROP: + result = UnicodeProperty.BINARY; + break; + case UCD_Types.UNKNOWN_PROP: + default: + result = UnicodeProperty.STRING; + //throw new IllegalArgumentException("Type: UNKNOWN_PROP"); + } + return result; + } + + static List lookup(String valueAlias, String[] main, String[] 
aux, Map aux2, List result) { + //System.out.println(valueAlias + "=>"); + //System.out.println("=>" + aux[pos]); + if (aux != null) { + int pos = 0xFF & Utility.lookup(valueAlias, main, true); + UnicodeProperty.addUnique(aux[pos], result); + } + UnicodeProperty.addUnique(valueAlias, result); + if (aux2 != null) { + String xtra = (String) aux2.get(valueAlias); + if (xtra != null) + UnicodeProperty.addUnique(xtra, result); + } + return result; + } + + /* + static class DerivedPropertyWrapper extends UnicodeProperty { + UCDProperty derivedProperty; + UCD ucd; + + DerivedPropertyWrapper(int derivedPropertyID, UCD ucd) { + this.ucd = ucd; + derivedProperty = DerivedProperty.make(derivedPropertyID, ucd); + } + protected String _getVersion() { + return ucd.getVersion(); + } + + protected String _getValue(int codepoint) { + return derivedProperty.getValue(codepoint, UCD_Types.LONG); + } + protected List _getNameAliases(List result) { + if (result != null) result = new ArrayList(1); + addUnique(derivedProperty.getName(UCD_Types.SHORT), result); + addUnique(derivedProperty.getName(UCD_Types.LONG), result); + return null; + } + + protected List _getValueAliases(String valueAlias, List result) { + // TODO Auto-generated method stub + return null; + } + protected List _getAvailableValues(List result) { + // TODO Auto-generated method stub + return null; + } + + } + */ } diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 280e39a05a6..61196167fde 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2006/04/05 22:12:44 $ -* $Revision: 1.41 $ +* $Date: 2006/11/27 23:15:21 $ +* $Revision: 1.42 $ * ******************************************************************************* */ @@ -20,6 +20,8 
@@ import java.util.ArrayList; import java.util.HashMap; import java.util.BitSet; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.io.IOException; import java.io.DataInputStream; @@ -31,6 +33,7 @@ import com.ibm.text.utility.*; import com.ibm.icu.dev.test.util.BagFormatter; import com.ibm.icu.dev.test.util.UnicodeMap; import com.ibm.icu.dev.test.util.UnicodeProperty; +import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; @@ -200,7 +203,13 @@ public final class UCD implements UCD_Types { * Get the name and number (U+xxxx NAME) for a code point */ public String getCodeAndName(int codePoint, byte type) { - return getCode(codePoint) + " " + getName(codePoint, type); + return getCodeAndName(codePoint, type, null); + } + + public String getCodeAndName(int codePoint, byte type, Transliterator charTrans) { + return getCode(codePoint) + + (charTrans == null ? " " : " ( " + charTrans.transliterate(UTF16.valueOf(codePoint)) + " ) ") + + getName(codePoint, type); } /** @@ -208,14 +217,18 @@ public final class UCD implements UCD_Types { * separated by ", " */ public String getCodeAndName(String s, byte type) { + return getCodeAndName(s,type,null); + } + + public String getCodeAndName(String s, byte type, Transliterator charTrans) { if (s == null || s.length() == 0) return "NULL"; - if (s.length() == 1) return getCodeAndName(s.charAt(0)); // fast path + if (s.length() == 1) return getCodeAndName(s.charAt(0), type, charTrans); // fast path StringBuffer result = new StringBuffer(); int cp; for (int i = 0; i < s.length(); i += UTF32.count16(cp)) { cp = UTF32.char32At(s, i); if (i > 0) result.append(", "); - result.append(getCodeAndName(cp)); + result.append(getCodeAndName(cp, type, charTrans)); } return result.toString(); } @@ -1666,24 +1679,34 @@ to guarantee identifier closure. 
return blockData.getSet(value, result); } + static final Matcher blockPattern = Pattern.compile("([0-9A-F]+)\\s*(?:[.][.]|[;])\\s*([0-9A-F]+)\\s*[;](.*)").matcher(""); private void loadBlocks() { blockData = new UnicodeMap(); + try { BufferedReader in = Utility.openUnicodeFile("Blocks", version, true, Utility.LATIN1); try { - while (true) { + for (int i = 1; ; ++i) { // 0000..007F; Basic Latin String line = Utility.readDataLine(in); if (line == null) break; if (line.length() == 0) continue; - int pos1 = line.indexOf('.'); - int pos2 = line.indexOf(';', pos1); + if (!blockPattern.reset(line).matches()) { + throw new IllegalArgumentException("Bad line: " + line); + } +// int pos1 = line.indexOf(';'); +// int pos2 = line.indexOf(';', pos1+1); //lastBlock = new BlockData(); - int start = Integer.parseInt(line.substring(0, pos1), 16); - int end = Integer.parseInt(line.substring(pos1+2, pos2), 16); - String name = line.substring(pos2+1).trim().replace(' ', '_'); - blockData.putAll(start,end, name); + try { + int start = Integer.parseInt(blockPattern.group(1), 16); + int end = Integer.parseInt(blockPattern.group(2), 16); + String name = blockPattern.group(3).trim().replace(' ', '_'); + blockData.putAll(start,end, name); + } catch (RuntimeException e) { + System.err.println("Failed on line " + i + "\t" + line); + throw e; + } } blockData.setMissing("No_Block"); } finally { diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-old.txt b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-old.txt new file mode 100644 index 00000000000..12ca3d3472f --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-old.txt @@ -0,0 +1,273 @@ +# Invariance tests +# Each line indicates an invariant set relationship to be tested, +# and is of the form: +# +# line := set relation set +# +# relation := '=' // has identical contents to +# := ('>' | '⊃') // is proper superset of +# := ('≥' | '⊇') // is superset of +# := ('<' | '⊂') // is proper subset of +# := 
('≤' | '⊆') // is subset of +# := '!' // has no intersection +# := '?' // none of the above (they overlap, and neither contains the other) +# +# A set is a standard UnicodeSet, but where $pv can be used to express properties +# +# pv := '$' '×'? prop (('=' | ':') value)? +# +# The × indicates that the property is the previous released version. +# That is, if the version is 4.0.1, then the × version is 4.0.0 +# If the value is missing, it is defaulted to true +# If the value is of the form «...», then the ... is interpreted as a regular expression +# The property can be the short or long form as in the PropertyAliases.txt +# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt +# +# A UnicodeSet is a boolean combinations of properties and character ranges, as you would see in +# Perl or other regular-expression languages. Examples: +# [$General_Category:Unassigned-[a-zA-Z]] +# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html +# +# WARNING: do not use \p{...} or [:...:] syntax, since those will be +# ICU's current version of properties, not the current snapshot's. +# Use the $ notation for properties (listed above) instead. +# +# When this file is parsed, an error message may contain <@> +# to indicate the location of an error in the input line. +# The Show command can be used to list any set on the console, for comparison. + +# General Constants +Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation] +Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol] +Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark] + +##### EXAMPLES OF USAGE ##### + +#Show [[^$gc:unassigned]-[^$×gc:unassigned]-[^$dt:none]] +#$GC:Zs ! $GC:Zp +#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter +$GC:Zs ? $Name:«.*SPACE.*» +#$Script:Common ! 
[$Alphabetic - $Math] + +# $Pattern_Whitespace = [$Whitespace \u200E \u200F] +# $Pattern_Syntax = [$gcAllSymbols $gcAllPunctuation [\u2190-\u2BFF\u2e00-\u2e7F]] +# $Pattern_Syntax ! $Alphabetic +# $Pattern_Syntax ! $ID_Continue + +# [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] + +# $script:greek = $×script:greek +# $gc:lm = $script:inherited + +# Examples of parsing errors + +# $LBA:Neutral = $GC:Zp # example of non-existant property +# $LB:foo = $GC:Zp # example of non-existant value +# $GC:Zs @ $GC:Zp # example of unknown relation + +#### REAL INVARIANTS FOLLOW #### + +# For illustration, different alias styles are used + +$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse] +$LB:OP = $GC:Ps +$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl] +$Dash ⊃ [$GC:Pd] +$Script:Common ! [$GC:Mn $GC:Me \u200c \u200d] +$Script:Inherited ⊆ [$GC:Mn $GC:Me \u200c \u200d] +# [$Alphabetic] ! 
$Script:Common +# & [$Decomposition_Type:None $Decomposition_Type:Canonical] + +$Alphabetic ⊃ [$Uppercase $Lowercase] + +# Numbers: the following must be equal + +$General_Category:Decimal_Number = $Numeric_Type:Decimal + +# Decimals are 0..9 + +Let $decimalValue = $Numeric_Value:«[0-9].0» +$decimalValue ⊇ $General_Category:Decimal_Number + +# All and only those items with numeric types have numeric values + +Let $anyNumericValue = $Numeric_Value:«-?[0-9]+.[0-9]+» +[$Numeric_Type:Decimal $Numeric_Type:Digit $Numeric_Type:Numeric] = $anyNumericValue + +# Canonical decompositions (minus exclusions) must be identical across releases +[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion] + +# Identifiers must be backwards compatible +$ID_Start ⊇ $×ID_Start +$ID_Continue ⊇ $×ID_Continue +$XID_Start ⊇ $×XID_Start +$XID_Continue ⊇ $×XID_Continue + +# Continue must contain start +$ID_Continue ⊇ $ID_Start +$XID_Continue ⊇ $XID_Start + +# Identifiers can't intersect pattern stuff +$ID_Continue ! [$Pattern_Whitespace $Pattern_Syntax] +$Pattern_Whitespace ! [$ID_Continue $Pattern_Syntax] +$Pattern_Syntax ! [$ID_Continue $Pattern_Whitespace] + +$XID_Continue ! [$Pattern_Whitespace $Pattern_Syntax] +$Pattern_Whitespace ! [$XID_Continue $Pattern_Syntax] +$Pattern_Syntax ! 
[$XID_Continue $Pattern_Whitespace] + +# Test SA characters + +# They are limited to certain scripts: +Let $SAScripts = [$script:thai $script:lao $script:myanmar $script:khmer $script:Tai_Le $script:New_Tai_Lue] +$SAScripts ⊇ $LineBreak:SA + +# And in those scripts, they are all the alphabetic spacing characters, plus some odd Cf & Mn +[$SAScripts & [$Alphabetic $gc:cf $gc:Mn \u19DE \u19DF]] = [$SAScripts & [$LineBreak:SA $LineBreak:CM]] + +#MY TEST +#Show [$gc:Mn - $Alphabetic] +#Show [$Alphabetic & $gc:Mn] + +# Try removing M* from alphabetic, and matching to SA +#Show [$SAScripts & [$Alphabetic $gc:cf - $gcAllMarks]] = $LineBreak:SA + +# Try adding M* to alphabetic, and matching to SA +#Show [$SAScripts & [$Alphabetic $gc:cf $gcAllMarks]] = $LineBreak:SA + +# testing +# [$Pattern_Whitespace $Pattern_Syntax] ! [[^$WB:Format $WB:Other] \u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A] +Let $otherword = [\u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A] +Let $currentword = [[^$WB:Format $WB:Other $WB:MidNum] $Grapheme_Extend $alphabetic] +Show [$currentword $otherword - $ID_Continue] +Show [$currentword $otherword - [$alphabetic $anyNumericValue $gcAllMarks]] +Show [$otherword - $currentword] +Show [$name:«.*LETTER.*» - $alphabetic] + +# Pattern characters are invariant! 
+# Add after 4.1.0 +$Pattern_Whitespace = $×Pattern_Whitespace +$Pattern_Syntax = $×Pattern_Syntax + +#BIDI invariant constants +Let $R_blocks = [$block:Kharoshthi $block:Hebrew $block:Cypriot_Syllabary \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF] +Let $AL_blocks = [[$block:Arabic_Supplement $block:Arabic $block:Syriac $block:Arabic $block:Thaana $block:Arabic_Presentation_Forms_A $block:Arabic_Presentation_Forms_B [\u0750-\u077F]] -$Noncharacter_Code_Point] + +#Unassigned characters in these blocks have R or AL respectively +$Bidi_Class:R ⊇ [$R_blocks & $gc:Cn] +$Bidi_Class:AL ⊇ [$AL_blocks & $gc:Cn] + +# There are no strong characters of the other directionalities (out of L, AL, R) in these blocks, +# and anything R or L is in the block (or RLM) +$R_blocks ! [$Bidi_Class:L $Bidi_Class:AL] +$AL_blocks ! [$Bidi_Class:L $Bidi_Class:R] +[$R_blocks $AL_blocks \u200F] ⊇ [$Bidi_Class:AL $Bidi_Class:R] + +# Derivations must match + +$Math = [$GC:Sm $Other_Math] +$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic] +$Lowercase = [$GC:Ll $Other_Lowercase] +$Uppercase = [$GC:Lu $Other_Uppercase] +$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start] +$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue] +$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]] +$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend] +$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend] + +# "Minimal" Other_: NOT hard requirements; just if we want to be minimal +# (Should add way to make these warnings, not errors) + +$Other_Math = [$Math - $GC:Sm] +$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] +$Other_Lowercase = [$Lowercase - $GC:Ll] +$Other_Uppercase = [$Uppercase - $GC:Lu] +$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo 
$GC:Nl]] +$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]] +$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]] + +# =========================== + +# POSIX Compatibility Properties (UTS#18) +# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html + +# constants + +Let $SP = [\u0020] # [\N{space}] +Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}] +Let $LF = [\u000A] # \N{linefeed} +Let $VTAB = [\u000B] # [\N{LINE TABULATION}] +Let $FF = [\u000C] # [\N{formfeed}] +Let $CR = [\u000D] # \N{carriage return} +Let $NEL = [\u0085] # \N{next line} +Let $ZWNJ = [\u200C] # [\N{ZERO WIDTH NON-JOINER}] +Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}] + +Let $strange = [\u24B6-\u24E9] + +# Unassigned, Control, Format, Private_Use, Surrogate, +# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter, +# Nonspacing_Mark, Enclosing_Mark, Spacing_Mark, +# Decimal_Number, Letter_Number, Other_Number, +# Space_Separator, Line_Separator, Paragraph_Separator, +# Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation +# Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol + +# UTS Rules + +Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ] +Let $lower = $Lowercase +Let $upper = [$Uppercase] +Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha] +Let $digit = $gc:Decimal_Number +Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both! 
+Let $alnum = [$alpha $digit] +Let $space = $Whitespace +Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]] +Let $cntrl = $gc:Control +Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ] +Let $print = [$graph $blank - $cntrl] +Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation] + +# =========================== + +# POSIX locale definition file constraints + +$upper ! [$cntrl $digit $punct $space] +$upper ≥ [A-Z] + +$lower ! [$cntrl $digit $punct $space] +$lower ≥ [a-z] + +$alpha ! [$cntrl $digit $punct $space] +$alpha ≥ [$lower $upper] + +$digit ≥ [0-9] + +$alnum = [$alpha $digit] + +$space ! [$upper $lower $alpha $digit $graph $xdigit] +$space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL] +$space ≥ $blank + +$cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit] + +$punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP] + +$graph ≥ [$upper $lower $alpha $digit $xdigit $punct] +$graph ! [$SP $cntrl] + +$print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP] +$print ! $cntrl + +$xdigit ≥ [$digit [a-f A-F]] + +$blank ≥ [$SP $TAB] + +# Extra POSIX 'POSIX locale' constraints + +$cntrl ≥ [\u0000-\u001F] + +$punct ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]] + +[$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate] diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-reallyold.txt b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-reallyold.txt new file mode 100644 index 00000000000..e90c4cbbfa2 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants-reallyold.txt @@ -0,0 +1,188 @@ +# Invariance tests +# dummy commit. 
+# Each line indicates an invariant set relationship to be tested, +# and is of the form: +# +# line := set relation set +# +# relation := '=' // has identical contents to +# := ('>' | '⊃') // is proper superset of +# := ('≥' | '⊇') // is superset of +# := ('<' | '⊂') // is proper subset of +# := ('≤' | '⊆') // is subset of +# := '!' // has no intersection +# := '?' // none of the above (they overlap, and neither contains the other) +# +# A set is a standard UnicodeSet, but where $pv can be used to express properties +# +# pv := '$' '×'? prop (('=' | ':') value)? +# +# The × indicates that the property is the previous released version. +# That is, if the version is 4.0.1, then the × version is 4.0.0 +# If the value is missing, it is defaulted to true +# If the value is of the form «...», then the ... is interpreted as a regular expression +# The property can be the short or long form as in the PropertyAliases.txt +# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt +# +# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in +# Perl or other regular-expression languages. Examples: +# [$General_Category:Unassigned-[a-zA-Z]] +# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html +# +# WARNING: do not use \p{...} or [:...:] syntax, since those will be +# ICU's current version of properties, not the current snapshot's. +# Use the $ notation for properties (listed above) instead. +# +# When this file is parsed, an error message may contain <@> +# to indicate the location of an error in the input line. + +# The following are not very interesting, but show examples of use + +#$GC:Zs ! $GC:Zp +#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter +$GC:Zs ? 
$Name:«.*SPACE.*» + +# [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] + +# Examples of parsing errors + +# $LBA:Neutral = $GC:Zp # example of non-existent property +# $LB:foo = $GC:Zp # example of non-existent value +# $GC:Zs @ $GC:Zp # example of unknown relation + +# The following should be real invariants +# For illustration, different alias styles are used + +$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse] +$LB:OP = $GC:Ps +$General_Category:Decimal_Number = $Numeric_Type:Decimal +$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl] +$Dash ⊃ [$GC:Pd] +$Script:Common ! [$GC:Mn $GC:Me] +$Script:Common ! 
[$Alphabetic - $Math] +$Alphabetic ⊃ [$Uppercase $Lowercase] + +# Comparisons across versions + +$ID_Start ⊇ $×ID_Start +$ID_Continue ⊇ $×ID_Continue +[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion] + +#$age:4.0.1 = $age4.0.0 + +# Derivations + +$Math = [$GC:Sm $Other_Math] +$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic] +$Lowercase = [$GC:Ll $Other_Lowercase] +$Uppercase = [$GC:Lu $Other_Uppercase] +$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start] +$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue] +$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]] +$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend] +$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend] + +# "Minimal" Other_: NOT hard requirements; just if we want to be minimal + +$Other_Math = [$Math - $GC:Sm] +$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] +$Other_Lowercase = [$Lowercase - $GC:Ll] +$Other_Uppercase = [$Uppercase - $GC:Lu] +$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] +$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]] +$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]] + +# Testing +# $script:greek = $×script:greek +# $gc:lm = $script:inherited + +# =========================== + +# Compatibility Properties (UTS#18) +# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html + +# constants + +Let $SP = [\u0020] # [\N{space}] +Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}] +Let $LF = [\u000A] # \N{linefeed} +Let $VTAB = [\u000B] # [\N{LINE TABULATION}] +Let $FF = [\u000C] # 
[\N{formfeed}] +Let $CR = [\u000D] # \N{carriage return} +Let $NEL = [\u0085] # \N{next line} +Let $ZWNJ = [\u200C] # [\N{ZERO WIDTH NON-JOINER}] +Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}] + +Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation] +Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol] +Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark] +Let $strange = [\u24B6-\u24E9] + +# Unassigned, Control, Format, Private_Use, Surrogate, +# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter, +# Nonspacing_Mark, Enclosing_Mark, Spacing_Mark, +# Decimal_Number, Letter_Number, Other_Number, +# Space_Separator, Line_Separator, Paragraph_Separator, +# Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation +# Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol + +# UTS Rules + +Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ] +Let $lower = $Lowercase +Let $upper = [$Uppercase] +Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha] +Let $digit = $gc:Decimal_Number +Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both! +Let $alnum = [$alpha $digit] +Let $space = $Whitespace +Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]] +Let $cntrl = $gc:Control +Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ] +Let $print = [$graph $blank - $cntrl] +Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation] + +# =========================== + +# POSIX locale definition file constraints + +$upper ! [$cntrl $digit $punct $space] +$upper ≥ [A-Z] + +$lower ! [$cntrl $digit $punct $space] +$lower ≥ [a-z] + +$alpha ! 
[$cntrl $digit $punct $space] +$alpha ≥ [$lower $upper] + +$digit ≥ [0-9] + +$alnum = [$alpha $digit] + +$space ! [$upper $lower $alpha $digit $graph $xdigit] +$space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL] +$space ≥ $blank + +$cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit] + +$punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP] + +$graph ≥ [$upper $lower $alpha $digit $xdigit $punct] +$graph ! [$SP $cntrl] + +$print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP] +$print ! $cntrl + +$xdigit ≥ [$digit [a-f A-F]] + +$blank ≥ [$SP $TAB] + +# Extra POSIX 'POSIX locale' constraints + +$cntrl ≥ [\u0000-\u001F] + +$punct ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]] + +[$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate] + +