From eb6243eb238e1c7a03f24d0b71df19d7c7a8c0d9 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Sat, 22 Jun 2002 21:05:34 +0000 Subject: [PATCH] added more conformance tests X-SVN-Rev: 8928 --- tools/unicodetools/com/ibm/text/UCA/Main.java | 10 +++-- .../com/ibm/text/UCA/WriteCollationData.java | 45 +++++++++++++++---- .../com/ibm/text/UCD/DerivedProperty.java | 6 +-- .../com/ibm/text/UCD/Normalizer.java | 17 ++++++- .../ibm/text/data/CheckCollationValidity.html | 2 +- .../com/ibm/text/data/FractionalUCA.txt | 2 +- .../com/ibm/text/data/FractionalUCA_long.txt | 2 +- 7 files changed, 64 insertions(+), 20 deletions(-) diff --git a/tools/unicodetools/com/ibm/text/UCA/Main.java b/tools/unicodetools/com/ibm/text/UCA/Main.java index 203bebd66bf..89f93cc0298 100644 --- a/tools/unicodetools/com/ibm/text/UCA/Main.java +++ b/tools/unicodetools/com/ibm/text/UCA/Main.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $ -* $Date: 2002/06/15 02:47:12 $ -* $Revision: 1.6 $ +* $Date: 2002/06/22 21:02:16 $ +* $Revision: 1.7 $ * ******************************************************************************* */ @@ -19,8 +19,10 @@ import com.ibm.text.utility.*; public class Main { static final String UCDVersion = ""; static final String[] ICU_FILES = {"writeCollationValidityLog", "FractionalUCA", - "writeconformance", "writeconformanceshifted", - "WriteRules", "WriteRulesWithNames", "WriteRulesXML"}; + "WriteRules", "WriteRulesWithNames", "WriteRulesXML", + "writeconformance", "writeconformanceshifted", + "short", "writeconformance", "writeconformanceshifted", + }; public static void main(String args[]) throws Exception { diff --git a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java index f26f5a39dd5..bb6a895fb70 100644 --- a/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java +++ b/tools/unicodetools/com/ibm/text/UCA/WriteCollationData.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $ -* $Date: 2002/06/22 01:21:08 $ -* $Revision: 1.21 $ +* $Date: 2002/06/22 21:02:16 $ +* $Revision: 1.22 $ * ******************************************************************************* */ @@ -292,6 +292,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types { static void writeConformance(String filename, byte option, boolean shortPrint) throws IOException { + Default.setUCD(); //UCD ucd30 = UCD.make("3.0.0"); /* @@ -405,12 +406,16 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON //log.println(source); char extra = source.charAt(source.length()-1); String clipped = source.substring(0, source.length()-1); + if (clipped.charAt(0) == LOW_ACCENT && extra != LOW_ACCENT) { + extra = LOW_ACCENT; + clipped = source.substring(1); + } if (!shortPrint) { log.print(Utility.hex(source)); log.print( ";\t# " + (extra != LOW_ACCENT ? extra : '.') + " " + ucd.getName(clipped, SHORT) + "\t" + UCA.toString(key)); } else { - log.print(source + "\t" + Utility.hex(clipped)); + log.print(Utility.hex(source) + "\t" + Utility.hex(clipped)); } log.println(); } @@ -424,17 +429,41 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON addStringX(UTF32.valueOf32(x), option); } - static final char LOW_ACCENT = '\u0325'; + static final char LOW_ACCENT = '\u0334'; + static int addCounter = 0; static void addStringX(String s, byte option) { + int firstChar = UTF16.charAt(s,0); + // add characters with different strengths, to verify the order addStringY(s + 'a', option); - addStringY(s + 'A', option); - addStringY(s + 'á', option); addStringY(s + 'b', option); - addStringY(s + LOW_ACCENT, option); + addStringY(s + 'á', option); + addStringY(s + 'A', option); addStringY(s + '!', option); + if (option == SHIFTED && collator.isVariable(firstChar)) addStringY(s + LOW_ACCENT, option); + + // NOW, if the character decomposes, or is a combining mark (non-zero), try combinations + + if (Default.ucd.getCombiningClass(firstChar) > 0 + || !Default.nfd.isNormalized(s) && !Default.ucd.isHangulSyllable(firstChar)) { + // if it ends with a non-starter, try the decompositions. + String decomp = Default.nfd.normalize(s); + if (Default.ucd.getCombiningClass(UTF16.charAt(decomp, decomp.length()-1)) > 0) { + if (canIt == null) canIt = new CanonicalIterator("."); + canIt.setSource(s + LOW_ACCENT); + int limit = 4; + for (String can = canIt.next(); can != null; can = canIt.next()) { + if (s.equals(can)) continue; + if (--limit < 0) continue; // just include a sampling + addStringY(can, option); + // System.out.println(addCounter++ + " Adding " + Default.ucd.getCodeAndName(can)); + } + } + } } + static CanonicalIterator canIt = null; + static char counter; static void addStringY(String s, byte option) { @@ -2162,7 +2191,7 @@ F900..FAFF; CJK Compatibility Ideographs Set additionalSet = new HashSet(); System.out.println("Loading canonical iterator"); - CanonicalIterator canIt = new CanonicalIterator("."); + if (canIt == null) canIt = new CanonicalIterator("."); Iterator it2 = contentsForCanonicalIteration.iterator(); System.out.println("Adding any FCD equivalents that have different sort keys"); while (it2.hasNext()) { diff --git a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java index 9039052aa7d..1b0b4e1b59b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java +++ b/tools/unicodetools/com/ibm/text/UCD/DerivedProperty.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $ -* $Date: 2002/06/22 01:21:09 $ -* $Revision: 1.15 $ +* $Date: 2002/06/22 21:02:16 $ +* $Revision: 1.16 $ * ******************************************************************************* */ @@ -745,8 +745,8 @@ of characters, the first of which has a non-zero combining class. public static void test() { Default.setUCD(); - DerivedProperty dprop = new DerivedProperty(Default.ucd); /* + DerivedProperty dprop = new DerivedProperty(Default.ucd); for (int j = 0; j < LIMIT; ++j) { System.out.println(); System.out.println(j + "\t" + dprop.getName(j)); diff --git a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java index f05e38c7ad7..47f3c33971b 100644 --- a/tools/unicodetools/com/ibm/text/UCD/Normalizer.java +++ b/tools/unicodetools/com/ibm/text/UCD/Normalizer.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $ -* $Date: 2002/06/13 21:14:05 $ -* $Revision: 1.10 $ +* $Date: 2002/06/22 21:02:16 $ +* $Revision: 1.11 $ * ******************************************************************************* */ @@ -226,6 +226,19 @@ public final class Normalizer implements UCD_Types { return !data.normalizationDiffers(ch, composition, compatibility); } + /** + * Utility: Checks whether there is a recursive decomposition of a character from the + * Unicode Character Database. It is compatibility or canonical according to the particular + * normalizer. + * @param ch the source character + */ + public boolean isNormalized(String s) { + if (UTF16.countCodePoint(s) > 1) { + return !data.normalizationDiffers(UTF16.charAt(s,0), composition, compatibility); + } + return s.equals(normalize(s)); // TODO: OPTIMIZE LATER + } + /** * Utility: Gets recursive decomposition of a character from the * Unicode Character Database. diff --git a/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html b/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html index 6d003d93a2c..491a7ab5635 100644 --- a/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html +++ b/tools/unicodetools/com/ibm/text/data/CheckCollationValidity.html @@ -1,7 +1,7 @@

1. Mismatches when NFD is OFF

Date:Fri Jun 21 16:56:03 PDT 2002

Date:Sat Jun 22 13:56:49 PDT 2002

File Version:-3.1.1d1

Alternate Handling = NON_IGNORABLE