From 70433b182bc5133732370807b4599202bbdf94bd Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 3 Dec 2001 19:29:35 +0000 Subject: [PATCH] ICU-0 Added Unicode property NF_SKIPPABLE X-SVN-Rev: 7274 --- .../com/ibm/text/UCA/GenOverlap.java | 6 +- tools/unicodetools/com/ibm/text/UCA/UCA.java | 54 +++- .../com/ibm/text/UCD/DerivedProperty.java | 130 ++++---- tools/unicodetools/com/ibm/text/UCD/Main.java | 5 +- .../com/ibm/text/UCD/NFSkippable.java | 284 ++++++++++++++++++ .../com/ibm/text/UCD/Normalizer.java | 59 ++-- .../com/ibm/text/UCD/PropertyLister.java | 10 +- .../ibm/text/UCD/PropertyValueAliasHeader.txt | 50 +++ tools/unicodetools/com/ibm/text/UCD/UCD.java | 14 +- .../com/ibm/text/UCD/UCD_Names.java | 6 +- .../com/ibm/text/UCD/UCD_Types.java | 14 +- .../com/ibm/text/UCD/UnicodeProperty.java | 44 +++ 12 files changed, 578 insertions(+), 98 deletions(-) create mode 100644 tools/unicodetools/com/ibm/text/UCD/NFSkippable.java create mode 100644 tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt create mode 100644 tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java diff --git a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java index 97278a46bb9..f9b0e57eada 100644 --- a/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java +++ b/tools/unicodetools/com/ibm/text/UCA/GenOverlap.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $ -* $Date: 2001/10/25 20:35:42 $ -* $Revision: 1.5 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -90,6 +90,7 @@ public class GenOverlap implements UCD_Types { addString(s, currCEList); } + /* for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) { if (!ucd.isRepresented(cp)) continue; byte decompType = ucd.getDecompositionType(cp); @@ -100,6 +101,7 @@ public 
class GenOverlap implements UCD_Types { System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist); } } + */ Utility.fixDot(); System.out.println("# Completes Count: " + completes.size()); diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index caab14ed69b..f7f04b964c8 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2001/10/31 00:01:28 $ -* $Revision: 1.7 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -74,7 +74,7 @@ final public class UCA implements Comparator { * Version of the UCA tables to use */ //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7"; - public static final String VERSION = ""; // ""; // "-2.1.9d7"; + public static final String VERSION = "-3.1.1d1"; // ""; // "-2.1.9d7"; public static final String ALLFILES = "allkeys"; // null if not there /** @@ -1019,6 +1019,7 @@ final public class UCA implements Comparator { // of the build process. String probe = String.valueOf(ch); Object value = contractingTable.get(probe); + if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch)); // We loop, trying to add successive characters to the longest substring. 
while (index < decompositionBuffer.length()) { @@ -1304,7 +1305,7 @@ final public class UCA implements Comparator { IntStack tempStack = new IntStack(100); // used for reversal StringBuffer multiChars = new StringBuffer(); // used for contracting chars String inputLine = ""; - while (true) { // try { + while (true) try { inputLine = in.readLine(); if (inputLine == null) break; // means file is done String line = cleanLine(inputLine); // remove comments, extra whitespace @@ -1326,14 +1327,17 @@ final public class UCA implements Comparator { // collect characters char value = getChar(line, position); + fixSurrogateContraction(value); char value2 = getChar(line, position); multiChars.setLength(0); // clear buffer if (value2 != NOT_A_CHAR) { + fixSurrogateContraction(value2); multiChars.append(value); // append until we get terminator multiChars.append(value2); while (true) { value2 = getChar(line, position); if (value2 == NOT_A_CHAR) break; + fixSurrogateContraction(value2); multiChars.append(value2); } } @@ -1410,9 +1414,21 @@ final public class UCA implements Comparator { //} catch (Exception e) { // throw new IllegalArgumentException("Malformed line: " + inputLine + "\n " // + e.getClass().getName() + ": " + e.getMessage()); + } catch (RuntimeException e) { + System.out.println("Error on line: " + inputLine); + throw e; } } + private void fixSurrogateContraction(char ch) { + //if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0])); + if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return; + String chs = String.valueOf(ch); + Object probe = contractingTable.get(chs); + if (probe != null) return; + contractingTable.put(chs, new Integer(0)); + } + private void concat(int[] ces1, int[] ces2) { } @@ -1479,6 +1495,7 @@ final public class UCA implements Comparator { Enumeration enum = contractingTable.keys(); while (enum.hasMoreElements()) { String sequence = (String)enum.nextElement(); + 
//System.out.println("Contraction: " + Utility.hex(sequence)); for (int i = sequence.length()-1; i > 0; --i) { String shorter = sequence.substring(0,i); Object probe = contractingTable.get(shorter); @@ -1550,9 +1567,18 @@ final public class UCA implements Comparator { * On output, updated to point to the next place to search. *@return the character, or NOT_A_CHAR when done */ + + // NOTE in case of surrogates, we buffer up the second character!! + char charBuffer = 0; + private char getChar(String line, int[] position) { - int start = position[0]; char ch; + if (charBuffer != 0) { + ch = charBuffer; + charBuffer = 0; + return ch; + } + int start = position[0]; while (true) { // trim whitespace if (start >= line.length()) return NOT_A_CHAR; ch = line.charAt(start); @@ -1560,13 +1586,25 @@ final public class UCA implements Comparator { start++; } // from above, we have at least one char - if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F')) { - position[0] = start + 4; - return (char)Integer.parseInt(line.substring(start,start+4),16); + int hexLimit = start; + while ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F')) { + hexLimit++; + ch = line.charAt(hexLimit); } + if (hexLimit >= start + 4) { + position[0] = hexLimit; + int cp = Integer.parseInt(line.substring(start,hexLimit),16); + if (cp <= 0xFFFF) return (char)cp; + //DEBUGCHAR = true; + charBuffer = UTF16.getTrailSurrogate(cp); + return UTF16.getLeadSurrogate(cp); + } + return NOT_A_CHAR; } + boolean DEBUGCHAR = false; + BitSet primarySet = new BitSet(); BitSet secondarySet = new BitSet(); BitSet tertiarySet = new BitSet(); diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 866ecbcc7e8..a3a62bf19a8 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ 
******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2001/10/26 23:33:08 $ -* $Revision: 1.6 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -22,24 +22,28 @@ public class DerivedProperty implements UCD_Types { // ADD CONSTANT to UCD_TYPES + static public UnicodeProperty getProperty(int derivedPropertyID, UCD ucd) { + return new DerivedProperty(ucd).dprops[derivedPropertyID]; + } + public DerivedProperty(UCD ucd) { ucdData = ucd; } public String getHeader(int propNumber) { - DProp dp = dprops[propNumber]; + UnicodeProperty dp = dprops[propNumber]; if (dp != null) return dp.getHeader(); else return "Unimplemented!!"; } public String getName(int propNumber, byte style) { - DProp dp = dprops[propNumber]; + UnicodeProperty dp = dprops[propNumber]; if (dp != null) return dp.getName(style); else return "Unimplemented!!"; } public String getProperty(int cp, int propNumber) { - DProp dp = dprops[propNumber]; + UnicodeProperty dp = dprops[propNumber]; if (dp != null) return dp.getProperty(cp); else return "Unimplemented!!"; } @@ -67,16 +71,17 @@ public class DerivedProperty implements UCD_Types { return dprops[propNumber].getProperty(int cp); } */ - private DProp[] dprops = new DProp[50]; + private UnicodeProperty[] dprops = new UnicodeProperty[50]; private Normalizer[] nf = new Normalizer[4]; private Normalizer nfd, nfc, nfkd, nfkc; - static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"}; + static final String[] CaseNames = { "Uppercase", "Lowercase", "Mixedcase"}; - - private abstract class DProp { + + /* + private abstract static class UnicodeProperty { boolean testStatus = false; byte defaultStyle = LONG; String name, shortName, header; @@ -90,13 +95,14 @@ public class DerivedProperty implements UCD_Types { public boolean propertyVaries() { return false; 
} public String getProperty(int cp) { return hasProperty(cp) ? name : ""; } } + */ - class ExDProp extends DProp { + class ExDProp extends UnicodeProperty { Normalizer nfx; ExDProp(int i) { - nfx = nf[i-ExpandsOnNFD]; - name = "Expands_On_" + NAME[i-ExpandsOnNFD]; - shortName = "XO_" + NAME[i-ExpandsOnNFD]; + nfx = nf[i]; + name = "Expands_On_" + nfx.getName(); + shortName = "XO_" + nfx.getName(); header = "# Derived Property: " + name + "\r\n# Generated according to UAX #15." + "\r\n# Characters whose normalized length is not one." @@ -111,16 +117,15 @@ public class DerivedProperty implements UCD_Types { } }; - class NF_UnsafeStartProp extends DProp { + class NF_UnsafeStartProp extends UnicodeProperty { Normalizer nfx; - int prop; + //int prop; NF_UnsafeStartProp(int i) { - testStatus = true; - prop = i-NFD_UnsafeStart; - nfx = nf[prop]; - name = NAME[prop] + "_UnsafeStart"; - shortName = NAME[prop] + "_SS"; + isStandard = false; + nfx = nf[i]; + name = nfx.getName() + "_UnsafeStart"; + shortName = nfx.getName() + "_SS"; header = "# Derived Property: " + name + "\r\n# Generated according to UAX #15." + "\r\n# Characters that are cc==0, BUT which may interact with previous characters." 
@@ -131,20 +136,20 @@ public class DerivedProperty implements UCD_Types { String norm = nfx.normalize(cp); int first = UTF16.charAt(norm, 0); if (ucdData.getCombiningClass(first) != 0) return true; - if ((prop == 1 || prop == 3) + if (nfx.isComposition() && dprops[NFC_TrailingZero].hasProperty(first)) return true; // 1,3 == composing return false; } }; - class NFC_Prop extends DProp { + class NFC_Prop extends UnicodeProperty { BitSet bitset; boolean filter = false; boolean keepNonZero = true; NFC_Prop(int i) { - testStatus = true; + isStandard = false; BitSet[] bitsets = new BitSet[3]; switch(i) { case NFC_Leading: bitsets[0] = bitset = new BitSet(); break; @@ -181,27 +186,27 @@ public class DerivedProperty implements UCD_Types { }; }; - class GenDProp extends DProp { + class GenDProp extends UnicodeProperty { Normalizer nfx; Normalizer nfComp = null; GenDProp (int i) { - testStatus = true; - nfx = nf[i-GenNFD]; - name = NAME[i-GenNFD]; + isStandard = false; + nfx = nf[i]; + name = nfx.getName(); String compName = "the character itself"; - if (i == GenNFKC || i == GenNFD) { + if (i == NFKC || i == NFD) { name += "-NFC"; nfComp = nfc; compName = "NFC for the character"; - } else if (i == GenNFKD) { + } else if (i == NFKD) { name += "-NFD"; nfComp = nfd; compName = "NFD for the character"; } header = "# Derived Property: " + name - + "\r\n# Lists characters in normalized form " + NAME[i-GenNFD] + "." + + "\r\n# Lists characters in normalized form " + nfx.getName() + "." + "\r\n# Only those characters whith normalized forms are DIFFERENT from " + compName + " are listed!" + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact." 
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!"; @@ -237,10 +242,10 @@ public class DerivedProperty implements UCD_Types { boolean hasProperty(int cp) { return getProperty(cp).length() != 0; } }; - class CaseDProp extends DProp { + class CaseDProp extends UnicodeProperty { byte val; CaseDProp (int i) { - testStatus = true; + isStandard = false; val = (i == Missing_Uppercase ? Lu : i == Missing_Lowercase ? Ll : Lt); name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase]; header = "# Derived Property: " + name @@ -256,16 +261,16 @@ public class DerivedProperty implements UCD_Types { } }; - class QuickDProp extends DProp { + class QuickDProp extends UnicodeProperty { String NO; String MAYBE; Normalizer nfx; QuickDProp (int i) { - nfx = nf[i - QuickNFD]; - NO = NAME[i-QuickNFD] + "_NO"; - MAYBE = NAME[i-QuickNFD] + "_MAYBE"; - name = NAME[i-QuickNFD] + "_QuickCheck"; - shortName = NAME[i-QuickNFD] + "_QC"; + nfx = nf[i]; + NO = nfx.getName() + "_NO"; + MAYBE = nfx.getName() + "_MAYBE"; + name = nfx.getName() + "_QuickCheck"; + shortName = nfx.getName() + "_QC"; header = "# Derived Property: " + name + "\r\n# Generated from computing decomposibles" + ((i == QuickNFC || i == QuickNFKC) @@ -288,11 +293,11 @@ public class DerivedProperty implements UCD_Types { nfkc = nf[3] = new Normalizer(Normalizer.NFKC); for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) { - dprops[i] = new ExDProp(i); + dprops[i] = new ExDProp(i-ExpandsOnNFD); } for (int i = GenNFD; i <= GenNFKC; ++i) { - dprops[i] = new GenDProp(i); + dprops[i] = new GenDProp(i-GenNFD); } for (int i = NFC_Leading; i <= NFC_Resulting; ++i) { @@ -300,10 +305,10 @@ public class DerivedProperty implements UCD_Types { } for (int i = NFD_UnsafeStart; i <= NFKC_UnsafeStart; ++i) { - dprops[i] = new NF_UnsafeStartProp(i); + dprops[i] = new NF_UnsafeStartProp(i-NFD_UnsafeStart); } - dprops[ID_Start] = new DProp() { + dprops[ID_Start] = new UnicodeProperty() { { name = "ID_Start"; 
shortName = "IDS"; @@ -316,7 +321,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[ID_Continue_NO_Cf] = new DProp() { + dprops[ID_Continue_NO_Cf] = new UnicodeProperty() { { name = "ID_Continue"; shortName = "IDC"; @@ -330,7 +335,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[Mod_ID_Start] = new DProp() { + dprops[Mod_ID_Start] = new UnicodeProperty() { { name = "XID_Start"; shortName = "XIDS"; @@ -345,7 +350,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[Mod_ID_Continue_NO_Cf] = new DProp() { + dprops[Mod_ID_Continue_NO_Cf] = new UnicodeProperty() { { name = "XID_Continue"; shortName = "XIDC"; @@ -361,7 +366,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[PropMath] = new DProp() { + dprops[PropMath] = new UnicodeProperty() { { name = "Math"; shortName = name; @@ -376,7 +381,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[PropAlphabetic] = new DProp() { + dprops[PropAlphabetic] = new UnicodeProperty() { { name = "Alphabetic"; shortName = "Alpha"; @@ -391,7 +396,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[PropLowercase] = new DProp() { + dprops[PropLowercase] = new UnicodeProperty() { { name = "Lowercase"; shortName = "Lower"; @@ -406,7 +411,7 @@ public class DerivedProperty implements UCD_Types { } }; - dprops[PropUppercase] = new DProp() { + dprops[PropUppercase] = new UnicodeProperty() { { name = "Uppercase"; shortName = "Upper"; @@ -432,7 +437,7 @@ including all characters whose canonical decomposition consists of a single char file by including all characters whose canonical decomposition consists of a sequence of characters, the first of which has a non-zero combining class. */ - dprops[FullCompExclusion] = new DProp() { + dprops[FullCompExclusion] = new UnicodeProperty() { { name = "Full_Composition_Exclusion"; shortName = "Comp_Ex"; @@ -451,9 +456,9 @@ of characters, the first of which has a non-zero combining class. 
} }; - dprops[FullCompInclusion] = new DProp() { + dprops[FullCompInclusion] = new UnicodeProperty() { { - testStatus = true; + isStandard = false; name = "Full_Composition_Inclusion"; shortName = "Comp_In"; defaultStyle = SHORT; @@ -471,7 +476,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[FC_NFKC_Closure] = new DProp() { + dprops[FC_NFKC_Closure] = new UnicodeProperty() { { name = "FC_NFKC_Closure"; shortName = "FC_NFKC"; @@ -491,7 +496,7 @@ of characters, the first of which has a non-zero combining class. boolean hasProperty(int cp) { return getProperty(cp).length() != 0; } }; - dprops[FC_NFC_Closure] = new DProp() { + dprops[FC_NFC_Closure] = new UnicodeProperty() { { name = "FC_NFC_Closure"; shortName = "FC_NFC"; @@ -512,10 +517,10 @@ of characters, the first of which has a non-zero combining class. }; for (int i = QuickNFD; i <= QuickNFKC; ++i) { - dprops[i] = new QuickDProp(i); + dprops[i] = new QuickDProp(i - QuickNFD); } - dprops[DefaultIgnorable] = new DProp() { + dprops[DefaultIgnorable] = new UnicodeProperty() { { name = "Default_Ignorable_Code_Point"; shortName = "DI"; @@ -538,7 +543,7 @@ of characters, the first of which has a non-zero combining class. # GraphemeBase := */ - dprops[GraphemeExtend] = new DProp() { + dprops[GraphemeExtend] = new UnicodeProperty() { { name = "Grapheme_Extend"; shortName = "GrExt"; @@ -556,7 +561,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[Other_Case_Ignorable] = new DProp() { + dprops[Other_Case_Ignorable] = new UnicodeProperty() { { name = "Other_Case_Ignorable"; shortName = "OCI"; @@ -577,7 +582,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[Type_i] = new DProp() { + dprops[Type_i] = new UnicodeProperty() { { name = "Special_Dotted"; shortName = "SDot"; @@ -606,7 +611,7 @@ of characters, the first of which has a non-zero combining class. 
} }; - dprops[Case_Ignorable] = new DProp() { + dprops[Case_Ignorable] = new UnicodeProperty() { { name = "Case_Ignorable"; shortName = "CI"; @@ -621,7 +626,7 @@ of characters, the first of which has a non-zero combining class. } }; - dprops[GraphemeBase] = new DProp() { + dprops[GraphemeBase] = new UnicodeProperty() { { name = "Grapheme_Base"; shortName = "GrBase"; @@ -648,6 +653,9 @@ of characters, the first of which has a non-zero combining class. if (cat == Ll || ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll; if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat; + + if (true) throw new IllegalArgumentException("FIX nf[2]"); + if (!nf[2].normalizationDiffers(cp)) return Lo; String norm = nf[2].normalize(cp); diff --git a/tools/unicodetools/com/ibm/text/UCD/Main.java b/tools/unicodetools/com/ibm/text/UCD/Main.java index a63d2dec1f0..af3e7b43b6a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Main.java +++ b/tools/unicodetools/com/ibm/text/UCD/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $ -* $Date: 2001/11/13 02:31:55 $ -* $Revision: 1.5 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -35,6 +35,7 @@ public final class Main { } else if (arg.equalsIgnoreCase("build")) { ConvertUCD.main(new String[]{ucdVersion}); } else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i]; + else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null); else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML(); else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed(); else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main(); diff --git a/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java b/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java new file mode 100644 
index 00000000000..34bce599a15 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java @@ -0,0 +1,284 @@ +package com.ibm.text.UCD; +import com.ibm.text.UTF16; +import com.ibm.text.UnicodeSet; +import java.util.BitSet; +import com.ibm.text.utility.*; +import java.io.PrintWriter; + + +public final class NFSkippable extends UnicodeProperty { + + static final boolean DEBUG = false; + + private Normalizer nf; + private Normalizer nfd; + private boolean composes; + private int[] realTrailers = new int[100]; + private int realTrailerCount = 0; + + public NFSkippable(byte normalizerMode, String unicodeVersion) { + isStandard = false; + ucd = UCD.make(unicodeVersion); + nf = new Normalizer(normalizerMode, unicodeVersion); + name = nf.getName() + "_Skippable"; + shortName = nf.getName() + "_Skip"; + header = "# Derived Property: " + name + + "\r\n# Generated according to UAX #15." + + "\r\n# Characters that don't interact with any others in this normalization form." + + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact." + + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!"; + + nfd = new Normalizer(Normalizer.NFD, unicodeVersion); + composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC; + + // preprocess to find possible trailers + + if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) { + if (nf.isTrailing(cp2)) { + //System.out.println("Trailing: " + ucd.getCodeAndName(cp2)); + if (ucd.isTrailingJamo(cp2)) { + //System.out.println("Jamo: " + ucd.getCodeAndName(cp2)); + continue; + } + realTrailers[realTrailerCount++] = cp2; + } + } + Utility.fixDot(); + //System.out.println("trailer count: " + realTrailerCount); + } + + /** A skippable character is
+ * a) unassigned, or ALL of the following:
+ * b) of combining class 0.
+ * c) not decomposed by this normalization form.
+ * AND if NFC or NFKC,
+ * d) can never compose with a previous character.
+ * e) can never compose with a following character.
+ * f) can never change if another character is added. + * Example: a-breve might satisfy all but f, but if you + * add an ogonek it changes to a-ogonek + breve + */ + + String cause = ""; + + public boolean hasProperty(int cp) { + // quick check on some special classes + if (DEBUG) cause = "\t\tunassigned"; + if (!ucd.isAssigned(cp)) return true; + + if (DEBUG) cause = "\t\tnf differs"; + if (nf.normalizationDiffers(cp)) return false; + + if (DEBUG) cause = "\t\tnon-zero cc"; + if (ucd.getCombiningClass(cp) != 0) return false; + + if (DEBUG) cause = ""; + if (!composes) return true; + + // now special checks for composing normalizers + if (DEBUG) cause = "\t\tleading"; + if (nf.isLeading(cp)) return false; + + if (DEBUG) cause = "\t\ttrailing"; + if (nf.isTrailing(cp)) return false; + + // OPTIMIZATION -- careful + // If there is no NFD decomposition, then this character's accents can't be + // "displaced", so we don't have to test further + + if (DEBUG) cause = "\t\tno decomp"; + if (!nfd.normalizationDiffers(cp)) return true; + + // OPTIMIZATION -- careful + // Hangul syllables are skippable IFF they are NOT isLeadingJamoComposition + if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp); + + // We now see if adding another character causes a problem. + // brute force for now!! + // We do skip the trailing Jamo, since those never displace! 
+ + StringBuffer base = new StringBuffer(UTF16.valueOf(cp)); + int baseLen = base.length(); + for (int i = 0; i < realTrailerCount; ++i) { + base.setLength(baseLen); // shorten if needed + base.append(UTF16.valueOf(realTrailers[i])); + String probe = base.toString(); + String result = nf.normalize(probe); + if (!result.equals(probe)) { + if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]); + return false; + } + } + + // passed the sieve, so we are ok + if (DEBUG) cause = ""; + return true; + } + + // both the following should go into UTF16 + + public static String replace(String source, int toReplace, int replacement) { + if (0 <= toReplace && toReplace <= 0xFFFF + && 0 <= replacement && replacement <= 0xFFFF) { + return source.replace((char)toReplace, (char)replacement); + } + return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement)); + } + + public static String replace(String source, String toReplace, String replacement) { + int pos = 0; + StringBuffer result = new StringBuffer(source.length()); + while (true) { + int newPos = source.indexOf(toReplace, pos); + if (newPos >= 0) { + result.append(source.substring(pos, newPos)); + result.append(replacement); + pos = newPos + toReplace.length(); + } else if (pos != 0) { + result.append(source.substring(pos)); + return result.toString(); + } else { + return source; // no change necessary + } + } + } + + static void writeStringInPieces(PrintWriter pw, String s, String term) { + int start; + int end; + int lineLen = 64; + for (start = 0; ; start = end) { + if (start == 0) pw.print("\t \""); + else pw.print("\t+ \""); + end = s.length(); + if (end > start + lineLen) end = start + lineLen; + + // if we have a slash in the last 5 characters, backup + + int lastSlash = s.lastIndexOf('\\', end); + if (lastSlash >= end-5) end = lastSlash; + + // backup if we broke on a \ + + while (end > start && s.charAt(end-1) == '\\') --end; + + pw.print(s.substring(start, end)); + if (end == 
s.length()) { + pw.println('"' + term); + break; + } else { + pw.println('"'); + } + } + } + + static void testWriteStringInPieces() { + String test = + "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD" + + "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F" + + "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-" + + "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017" + + "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u" + + "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226"; + PrintWriter pw = new PrintWriter(System.out); + writeStringInPieces(pw,test,""); + writeStringInPieces(pw,replace(test, "\\", "\\\\"),""); + + pw.flush(); + } + + static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller + + public static void main (String[] args) throws java.io.IOException { + + String version = ""; // Unicode version, "" = latest released + + PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt"); + + for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) { + UnicodeProperty up = DerivedProperty.getProperty(mode, UCD.make(version)); + generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up); + } + + for (byte mode = NFD; mode <= NFKC; ++mode) { + NFSkippable skipper = new NFSkippable(mode,version); + generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper); + } + + out.close(); + } + + static void generateSet(PrintWriter out, String label, UnicodeProperty up) { + System.out.println("Generating: " + up.getName(NORMAL)); + UnicodeSet result = new UnicodeSet(); + for (int cp = 0; cp <= limit; ++cp) { + Utility.dot(cp); + if (up.hasProperty(cp)) result.add(cp); + } + Utility.fixDot(); + + String rSet = result.toPattern(true); + rSet = replace(rSet, "\\U", "\\\\U"); + out.println(label + " = new UnicodeSet("); + writeStringInPieces(out, rSet, ", false);"); + out.println(); + + rSet = 
result.toPattern(false); + out.println("/*Unicode: "); + writeStringInPieces(out, rSet, "*/"); + out.println(); + out.flush(); + } + + /* + // DerivedProperty dp = new DerivedProperty(UCD.make(version)); + + System.out.println(skipper.getName(NORMAL)); + + UnicodeSet result = new UnicodeSet(); + for (int cp = 0; cp <= limit; ++cp) { + Utility.dot(cp); + if (skipper.hasProperty(cp)) result.add(cp); + } + Utility.fixDot(); + + String rSet = result.toPattern(true); + rSet = replace(rSet, "\\U", "\\\\U"); + out.println("\tSKIPPABLE[" + skipper.getName(NORMAL) + + "] = new UnicodeSet("); + writeStringInPieces(out, rSet, ", false);"); + out.println(); + + rSet = result.toPattern(false); + out.println("/*Unicode: "); + */ + //writeStringInPieces(out, rSet, "*/"); + /*out.println(); + out.flush(); + + if (false) { + NFSkippable skipper = new NFSkippable(Normalizer.NFC,""); + NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,""); + for (int cp = 0; cp <= 0x10FFFF; ++cp) { + if (cp > 0xFF) { + if (!skipper.ucd.isAssigned(cp)) continue; + byte cat = skipper.ucd.getCategory(cp); + if (cat == PRIVATE_USE || cat == SURROGATE) continue; + if (skipper.ucd.getCombiningClass(cp) != 0) continue; + if (skipper.nf.normalizationDiffers(cp)) continue; + if ((cp < 0xAC00 || cp > 0xAE00) + && cp != skipper.ucd.mapToRepresentative(cp, false)) continue; + } + + if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue; + + String status = (skipper.hasProperty(cp) ? " SKIPc " : "NOSKIPc ") + + (skipper2.hasProperty(cp) ? 
" SKIPkc " : "NOSKIPkc "); + System.out.println(status + + skipper.ucd.getCodeAndName(cp) + + skipper.cause); + } + } + */ + +} \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index 299de02b008..9a727b3316a 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2001/10/25 20:33:46 $ -* $Revision: 1.4 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -40,8 +40,9 @@ public final class Normalizer implements UCD_Types { * Create a normalizer for a given form. */ public Normalizer(byte form, String unicodeVersion) { - this.composition = (form & COMPOSITION_MASK) != 0; - this.compatibility = (form & COMPATIBILITY_MASK) != 0; + this.form = form; + this.composition = (form & NF_COMPOSITION_MASK) != 0; + this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0; this.data = getData(unicodeVersion); } @@ -53,20 +54,32 @@ public final class Normalizer implements UCD_Types { } /** - * Masks for the form selector - */ - public static final byte - COMPATIBILITY_MASK = 1, - COMPOSITION_MASK = 2; + * Return string name + */ + public static String getName(byte form) { + return UCD_Names.NF_NAME[form]; + } /** - * Normalization Form Selector - */ - public static final byte - NFD = 0 , - NFKD = COMPATIBILITY_MASK, - NFC = COMPOSITION_MASK, - NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK); + * Return string name + */ + public String getName() { + return getName(form); + } + + /** + * Does compose? + */ + public boolean isComposition() { + return composition; + } + + /** + * Does compose? 
+ */ + public boolean isCompatibility() { + return compatibility; + } /** * Normalizes text according to the chosen form, @@ -234,6 +247,10 @@ public final class Normalizer implements UCD_Types { return this.composition ? data.isTrailing(cp) : false; } + public boolean isLeading(int cp) { + return this.composition ? data.isLeading(cp) : false; + } + // ====================================== // PRIVATES @@ -242,13 +259,14 @@ public final class Normalizer implements UCD_Types { /** * The current form. */ + private byte form; private boolean composition; private boolean compatibility; /** * Decomposes text, either canonical or compatibility, * replacing contents of the target buffer. - * @param form the normalization form. If COMPATIBILITY_MASK + * @param form the normalization form. If NF_COMPATIBILITY_MASK * bit is on in this byte, then selects the recursive * compatibility decomposition, otherwise selects * the recursive canonical decomposition. @@ -342,6 +360,7 @@ public final class Normalizer implements UCD_Types { private UCD ucd; private HashMap compTable = new HashMap(); private BitSet isSecond = new BitSet(); + private BitSet isFirst = new BitSet(); private BitSet canonicalRecompose = new BitSet(); private BitSet compatibilityRecompose = new BitSet(); static final int NOT_COMPOSITE = 0xFFFF; @@ -352,6 +371,7 @@ public final class Normalizer implements UCD_Types { if (!ucd.isAssigned(i)) continue; if (ucd.isPUA(i)) continue; if (ucd.isTrailingJamo(i)) isSecond.set(i); + if (ucd.isLeadingJamoComposition(i)) isFirst.set(i); byte dt = ucd.getDecompositionType(i); if (dt != CANONICAL) continue; if (!ucd.getBinaryProperty(i, CompositionExclusion)) { @@ -364,6 +384,7 @@ public final class Normalizer implements UCD_Types { } int a = UTF16.charAt(s, 0); if (ucd.getCombiningClass(a) != 0) continue; + isFirst.set(a); int b = UTF16.charAt(s, UTF16.getCharCount(a)); isSecond.set(b); @@ -429,6 +450,10 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA 
return isSecond.get(cp); } + boolean isLeading(int cp) { + return isFirst.get(cp); + } + boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) { byte dt = ucd.getDecompositionType(cp); if (!composition) { diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java index 5ef990e8811..ca6d2c9c198 100644 --- a/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyLister.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $ -* $Date: 2001/09/19 23:33:16 $ -* $Revision: 1.3 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.4 $ * ******************************************************************************* */ @@ -15,6 +15,7 @@ package com.ibm.text.UCD; import java.io.*; import com.ibm.text.utility.*; +import com.ibm.text.UnicodeSet; import java.text.NumberFormat; @@ -32,6 +33,7 @@ abstract public class PropertyLister implements UCD_Types { protected int firstRealCp = -2; protected int lastRealCp = -2; protected boolean alwaysBreaks = false; // set to true if property only breaks + private UnicodeSet set = new UnicodeSet(); public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3; @@ -65,6 +67,7 @@ abstract public class PropertyLister implements UCD_Types { public void format(int startCp, int endCp, int realCount) { try { + set.add(startCp, endCp); String prop = propertyName(startCp); if (prop.length() > 0) prop = "; " + prop; String opt = optionalName(startCp); @@ -153,6 +156,7 @@ abstract public class PropertyLister implements UCD_Types { } public int print() { + set.clear(); int count = 0; firstRealCp = -1; byte firstRealCpCat = -1; @@ -215,6 +219,8 @@ abstract public class PropertyLister implements UCD_Types { output.println(); output.println("# Total code points: " + 
nf.format(count)); output.println(); + System.out.println(headerString()); + System.out.println(set.toPattern(true)); return count; } diff --git a/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt new file mode 100644 index 00000000000..ac9344343ba --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/PropertyValueAliasHeader.txt @@ -0,0 +1,50 @@ +# DRAFT +# PropertyValueAliases-3.2.0.txt +# +# This file contains aliases for property values used in the UCD. +# These names can be used for XML formats of UCD data, for regular-expression +# property tests, and other programmatic textual descriptions of Unicode data. +# The names are not normative, except where they correspond to normative property +# values in the UCD. For information on which properties are normative, see +# UnicodeCharacterDatabase.html. +# +# The names may be translated in appropriate environments, and additional +# aliases may be useful. +# +# FORMAT +# +# Each line describes a property value name. +# This consists of three fields, separated by semicolons. +# +# First Field: The first field describes the property for which that +# property value name is used. +# There is one special pseudo-property: "qc" stands for any quick-check property +# +# Second Field: The second field is an abbreviated name. +# If there is no abbreviated name available, the field is marked with "n/a". +# +# Third Field: The third field is a long name. +# +# With loose matching of property names, the case distinctions, whitespace, +# and '_' are ignored. +# +# NOTE: The Block property values are in Blocks.txt, and not repeated here. +# For more information on the use of blocks, see UTR #24: Regular Expression Guidelines +# +# NOTE: Currently there is at most one abbreviated name and one long name for +# property value. However, in the future additional aliases +# may be added. 
In such a case, the first line for the property value +# would have the preferred alias for output. +# +# NOTE: The property value names are NOT unique across properties, especially +# with loose matches. For example, +# AL means Arabic Letter for the Bidi_Class property, and +# AL means Alpha_Left for the Combining_Class property, and +# AL means Alphabetic for the Line_Break property. +# +# In addition, some property names may be the same as some property value names: +# cc means Combining_Class property, and +# cc means the General_Category property value Control (cc) +# +# The combination of property value and property name is, however, unique. +# For more information, see UTR #24: Regular Expression Guidelines diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD.java b/tools/unicodetools/com/ibm/text/UCD/UCD.java index 38e5ea99334..5479e56a1ab 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $ -* $Date: 2001/10/26 23:33:07 $ -* $Revision: 1.5 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.6 $ * ******************************************************************************* */ @@ -977,6 +977,16 @@ to guarantee identifier closure. 
} return 0xFFFF; // no composition } + + static boolean isHangulSyllable(int char1) { + return SBase <= char1 && char1 < SLimit; + } + + static boolean isLeadingJamoComposition(int char1) { + return (LBase <= char1 && char1 < LLimit + || SBase <= char1 && char1 < SLimit + && ((char1 - SBase) % TCount) == 0); + } static boolean isTrailingJamo(int cp) { return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit); diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java index 281e7ae24a8..5e27dee4bdf 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Names.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $ -* $Date: 2001/11/13 02:31:55 $ -* $Revision: 1.7 $ +* $Date: 2001/12/03 19:29:35 $ +* $Revision: 1.8 $ * ******************************************************************************* */ @@ -766,6 +766,8 @@ final class UCD_Names implements UCD_Types { "P", // U+11C1; P; HANGUL JONGSEONG PHIEUPH "H", // U+11C2; H; HANGUL JONGSEONG HIEUH }; + + static final String[] NF_NAME = {"NFD", "NFC", "NFKD", "NFKC"}; /* static { diff --git a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java index ba98d049b52..c374a9e1885 100644 --- a/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java +++ b/tools/unicodetools/com/ibm/text/UCD/UCD_Types.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $ -* $Date: 2001/10/25 20:33:46 $ -* $Revision: 1.4 $ +* $Date: 2001/12/03 19:29:34 $ +* $Revision: 1.5 $ * ******************************************************************************* */ @@ -390,6 +390,11 @@ public static byte ZAIN = 49, LIMIT_JOINING_GROUP = 50; + 
static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3; + public static final int + NF_COMPATIBILITY_MASK = 2, + NF_COMPOSITION_MASK = 1; + // DERIVED PROPERTY static final int @@ -448,6 +453,11 @@ public static byte NFKD_UnsafeStart = 39, NFKC_UnsafeStart = 40, + NFD_Skippable = 41, + NFC_Skippable = 42, + NFKD_Skippable = 43, + NFKC_Skippable = 44, + LIMIT = 41; } \ No newline at end of file diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java b/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java new file mode 100644 index 00000000000..fe4aad7d945 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java @@ -0,0 +1,44 @@ +package com.ibm.text.UCD; +public abstract class UnicodeProperty implements UCD_Types { + + protected UCD ucd; + protected boolean isStandard = true; + protected byte defaultStyle = LONG; + protected String name, shortName, header; + + // Old Names for compatibility + boolean isTest() { return isStandard(); } + + /** + * Is it part of the standard, or just for my testing? + */ + public boolean isStandard() { return isStandard; } + + /** + * Get the property name. Style is SHORT, NORMAL, LONG + */ + public String getName(byte style) { + if (style == NORMAL) style = defaultStyle; + return style < LONG ? shortName : name; + } + + /** Header used in DerivedXXX files + */ + public String getHeader() { return header; } + + /** + * Does getProperty vary in contents? + */ + public boolean propertyVaries() { return false; } + + /** + * Get the property value as a string, or "" if hasProperty is false + */ + public String getProperty(int cp) { return hasProperty(cp) ? name : ""; } + + /** + * Does it have the propertyValue + */ + abstract boolean hasProperty(int cp); + } +