From cbe4468265639ab7942ad230ed520fdade014f17 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 24 Jun 2002 15:25:10 +0000 Subject: [PATCH] added new parameters to rules, fixed a bit of the xml X-SVN-Rev: 8930 --- tools/unicodetools/com/ibm/text/UCA/UCA.java | 10 +- .../com/ibm/text/UCA/WriteCollationData.java | 126 +++++++++++++----- .../com/ibm/text/UCD/Normalizer.java | 6 +- .../com/ibm/text/utility/Utility.java | 8 +- 4 files changed, 109 insertions(+), 41 deletions(-) diff --git a/tools/unicodetools/com/ibm/text/UCA/UCA.java b/tools/unicodetools/com/ibm/text/UCA/UCA.java index ffd674516cf..d64497980bb 100644 --- a/tools/unicodetools/com/ibm/text/UCA/UCA.java +++ b/tools/unicodetools/com/ibm/text/UCA/UCA.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $ -* $Date: 2002/06/15 02:47:12 $ -* $Revision: 1.14 $ +* $Date: 2002/06/24 15:25:10 $ +* $Revision: 1.15 $ * ******************************************************************************* */ @@ -199,6 +199,7 @@ final public class UCA implements Comparator, UCA_Types { hangulBuffer.setLength(0); // clear hangul buffer char weight4 = '\u0000'; // DEFAULT FOR NON_IGNORABLE + boolean lastWasVariable = false; // process CEs, building weight strings while (true) { @@ -219,8 +220,13 @@ final public class UCA implements Comparator, UCA_Types { weight4 = 0; } else if (isVariable(ce)) { // variables weight4 = getPrimary(ce); + lastWasVariable = true; ce = 0; + } else if (lastWasVariable && getPrimary(ce) == 0) { // zap trailing ignorables + ce = 0; + weight4 = 0; } else { // above variables + lastWasVariable = false; weight4 = '\uFFFF'; } break; diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index bb6a895fb70..808943150b3 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ 
b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/06/22 21:02:16 $ -* $Revision: 1.22 $ +* $Date: 2002/06/24 15:25:10 $ +* $Revision: 1.23 $ * ******************************************************************************* */ @@ -415,7 +415,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON log.print( ";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key)); } else { - log.print(Utility.hex(source) + "\t" + Utility.hex(clipped)); + log.print(Utility.hex(source) + ";\t" + Utility.hex(clipped)); } log.println(); } @@ -430,6 +430,11 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON } static final char LOW_ACCENT = '\u0334'; + static final String SUPPLEMENTARY_ACCENT = UTF16.valueOf(0x1D165); + static final String COMPLETELY_IGNOREABLE = "\u0001"; + static final String COMPLETELY_IGNOREABLE_ACCENT = "\u0591"; + static final String[] CONTRACTION_TEST = {SUPPLEMENTARY_ACCENT, COMPLETELY_IGNOREABLE, COMPLETELY_IGNOREABLE_ACCENT}; + static int addCounter = 0; static void addStringX(String s, byte option) { @@ -460,6 +465,17 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON } } } + if (UTF16.countCodePoint(s) > 1) { + for (int i = 1; i < s.length(); ++i) { + if (UTF16.isLeadSurrogate(s.charAt(i-1))) continue; // skip if in middle of supplementary + + for (int j = 0; j < CONTRACTION_TEST.length; ++j) { + String extra = s.substring(0,i) + CONTRACTION_TEST[j] + s.substring(i); + addStringY(extra + 'a', option); + System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(extra)); + } + } + } } static CanonicalIterator canIt = null; @@ -1399,13 +1415,14 @@ F900..FAFF; CJK Compatibility Ideographs }; if (option == IN_XML) { - log.println(""); + log.println(""); log.println(""); - 
log.println(""); + log.println(""); + log.println(""); } else { log.write('\uFEFF'); // BOM for (int i = 0; i < commentText.length; ++i) { @@ -1538,38 +1555,60 @@ F900..FAFF; CJK Compatibility Ideographs String reset = ""; String resetComment = ""; int xmlReset = 0; - + boolean insertVariableTop = false; + boolean resetToParameter = false; + + int ceLayout = getCELayout(ce); + if (ceLayout == IMPLICIT) { + if (relation == PRIMARY_DIFF) { + int primary = UCA.getPrimary(ce); + int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1])); + + int[] ces2 = new int[50]; + int len2 = collator.getCEs(UTF16.valueOf(resetCp), true, ces2); + relation = getStrengthDifference(ces, len, ces2, len2); + + reset = quoteOperand(UTF16.valueOf(resetCp)); + resetComment = ucd.getCodeAndName(resetCp); + // lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY); + xmlReset = 2; + } + // lastCJKPrimary = primary; + } else if (ceLayout != getCELayout(lastCE) || firstTime) { + resetToParameter = true; + switch (ceLayout) { + case T_IGNORE: reset = "last tertiary ignorable"; break; + case S_IGNORE: reset = "last secondary ignorable"; break; + case P_IGNORE: reset = "last primary ignorable"; break; + case VARIABLE: reset = "last non-ignorable"; break; + case NON_IGNORE: /*reset = "top"; */ insertVariableTop = true; break; + case TRAILING: reset = "last trailing"; break; + } + } + + /* if (firstTime || collator.getPrimary(lastCE) == 0 && collator.getPrimary(ce) != 0 || collator.getSecondary(lastCE) == 0 && collator.getSecondary(ce) != 0 || collator.getTertiary(lastCE) == 0 && collator.getTertiary(ce) != 0) { if (collator.getPrimary(ce) != 0) { - reset = "[top]"; + + } else if (collator.getSecondary(ce) != 0) { + reset = "[last secondary ignorable]"; + } else if (collator.getTertiary(ce) != 0) { + reset = "[last tertiary ignorable]"; } else { - reset = quoteOperand(chr); + + //reset = quoteOperand(chr); } - } else if (variableTop != 0 && (ce & 0xFFFF0000L) > 
variableTop) { - reset = "[variable\\u0020top]"; + } else if (variableTop != 0 && ce > variableTop) { + reset = "[variable top]"; xmlReset = 1; variableTop = 0; } else { int primary = collator.getPrimary(ce); if (UCA.isImplicitLeadPrimary(primary)) { - if (relation == PRIMARY_DIFF) { - int resetCp = UCA.ImplicitToCodePoint(primary, UCA.getPrimary(ces[1])); - - int[] ces2 = new int[50]; - int len2 = collator.getCEs(UTF16.valueOf(resetCp), true, ces2); - relation = getStrengthDifference(ces, len, ces2, len2); - - reset = quoteOperand(UTF16.valueOf(resetCp)); - resetComment = ucd.getCodeAndName(resetCp); - // lastCE = UCA.makeKey(primary, UCA.NEUTRAL_SECONDARY, UCA.NEUTRAL_TERTIARY); - xmlReset = 2; - } - // lastCJKPrimary = primary; - } - } + */ /* if (primary >= 0x3400) { @@ -1607,26 +1646,30 @@ F900..FAFF; CJK Compatibility Ideographs // print results if (option == IN_XML) { - if (xmlReset == 1) log.print(""); + if (insertVariableTop) log.println(XML_RELATION_NAMES[0] + ""); /*log.print(" "); */ - if (xmlReset == 2) { - log.print("" + Utility.quoteXML(reset)); + if (reset.length() != 0) { + log.println("" + + (resetToParameter ? "" : Utility.quoteXML(reset)) + + (resetComment.length() != 0 ? "": "")); } if (!firstTime) { log.print(" <" + XML_RELATION_NAMES[relation] + "/>"); - log.print(Utility.quoteXML(chr)); + log.println(Utility.quoteXML(chr)); //log.print(""); } if (expansion.length() > 0) { - log.print("" + Utility.quoteXML(expansion)); + log.println("" + Utility.quoteXML(expansion)); } } else { - if (reset.length() != 0) log.println("& " + reset + if (insertVariableTop) log.println(RELATION_NAMES[0] + " [variable top]"); + if (reset.length() != 0) log.println("& " + + (resetToParameter ? "[" : "") + reset + (resetToParameter ? "]" : "") + (resetComment.length() != 0 ? 
"\t\t# " + resetComment : "")); if (!firstTime) log.print(RELATION_NAMES[relation] + " " + quoteOperand(chr)); if (expansion.length() > 0) log.print(" / " + quoteOperand(expansion)); @@ -1641,12 +1684,31 @@ F900..FAFF; CJK Compatibility Ideographs firstTime = false; } // log.println("& [top]"); // RESET - if (option == IN_XML) log.println(""); + if (option == IN_XML) log.println(""); log2.close(); log.close(); Utility.fixDot(); } + static final int NONE = 0, T_IGNORE = 1, S_IGNORE = 2, P_IGNORE = 3, VARIABLE = 4, NON_IGNORE = 5, IMPLICIT = 6, TRAILING = 7; + + /* Classifies ce as one of the layout constants above: by which of its primary/secondary/tertiary weights are zero, else by isVariable, else by where the primary falls relative to UNSUPPORTED_BASE and UNSUPPORTED_LIMIT. */ static int getCELayout(int ce) { + int primary = collator.getPrimary(ce); + int secondary = collator.getSecondary(ce); + int tertiary = collator.getTertiary(ce); /* must be the tertiary weight: it alone separates T_IGNORE from S_IGNORE */ + if (primary == 0) { + if (secondary == 0) { + if (tertiary == 0) return T_IGNORE; + return S_IGNORE; + } + return P_IGNORE; + } + if (collator.isVariable(ce)) return VARIABLE; + if (primary < UNSUPPORTED_BASE) return NON_IGNORE; + if (primary < UNSUPPORTED_LIMIT) return IMPLICIT; + return TRAILING; + } + static long getPrimary(int[] ces, int len) { if (len <= 0) return 0; if (UCA.isImplicitLeadCE(ces[0])) { diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index 47f3c33971b..c4ac9711110 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2002/06/22 21:02:16 $ -* $Revision: 1.11 $ +* $Date: 2002/06/24 15:25:10 $ +* $Revision: 1.12 $ * 
******************************************************************************* */ @@ -212,7 +212,7 @@ public final class Normalizer implements UCD_Types { * @return value from 0 to 255 */ - public short getCanonicalClass(char ch) { + public short getCanonicalClass(int ch) { return 
data.getCanonicalClass(ch); } diff --git a/tools/unicodetools/com/ibm/text/utility/Utility.java b/tools/unicodetools/com/ibm/text/utility/Utility.java index 2fc63010a5d..a5f427d078d 100644 --- a/tools/unicodetools/com/ibm/text/utility/Utility.java +++ b/tools/unicodetools/com/ibm/text/utility/Utility.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $ -* $Date: 2002/06/22 01:21:11 $ -* $Revision: 1.19 $ +* $Date: 2002/06/24 15:25:10 $ +* $Revision: 1.20 $ * ******************************************************************************* */ @@ -430,12 +430,12 @@ public final class Utility { // COMMON UTILITIES // fix noncharacters, since XML can't handle case 0xFFFE: case 0xFFFF: - return "#" + hex(c,1); + return ""; } // fix surrogates, since XML can't handle if (UTF32.isSurrogate(c)) { - return "#" + hex(c,1); + return ""; } if (c <= 0x7E || UTF8) {