From 9c6b10e2dc1a77df216c05f52b9ac87df7b36a22 Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Wed, 26 Feb 2003 00:35:09 +0000 Subject: [PATCH] updated for 4.0 X-SVN-Rev: 11164 --- .../UCD/GenerateStandardizedVariants.java | 113 +++++++++++++++ .../UCD/StandardizedVariants-Template.html | 137 ++++++++++++++++++ .../com/ibm/text/UCD/TestNameUniqueness.java | 109 ++++++++++++++ 3 files changed, 359 insertions(+) create mode 100644 tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java create mode 100644 tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html create mode 100644 tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java b/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java new file mode 100644 index 00000000000..37a2a5abf5f --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java @@ -0,0 +1,113 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. 
* +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $ +* $Date: 2003/02/26 00:35:09 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; +import com.ibm.text.utility.*; +import com.ibm.icu.text.UTF16; +import com.ibm.icu.text.UnicodeSet; +import java.util.*; +import java.io.*; + +public final class GenerateStandardizedVariants implements UCD_Types { + + static public String showVarGlyphs(String code0, String code1, String shape) { + System.out.println(code0 + ", " + code1 + ", [" + shape + "]"); + + String abbShape = ""; + if (shape.length() != 0) { + abbShape = '-' + shape.substring(0,4); + if (shape.endsWith("-feminine")) abbShape += "fem"; + } + + return "U+" + code0 + "+U+" + code1 + "/" + shape 
+            + ""; + } + +/* +# Field 0: the variation sequence +# Field 1: the description of the desired appearance +# Field 2: where the appearance is only different in in particular shaping environments +# this field lists them. The possible values are: isolated, initial, medial, final. +# If more than one is present, there are spaces between them. +*/ + static public void generate() throws IOException { + Default.setUCD(); + + // read the data and compose the table + + String table = ""; + + String[] splits = new String[4]; + String[] codes = new String[2]; + String[] shapes = new String[4]; + + BufferedReader in = Utility.openUnicodeFile("StandardizedVariants", Default.ucdVersion, true, Utility.LATIN1); + while (true) { + String line = Utility.readDataLine(in); + if (line == null) break; + if (line.length() == 0) continue; + + int count = Utility.split(line, ';', splits); + int codeCount = Utility.split(splits[0], ' ', codes); + int code = Utility.codePointFromHex(codes[0]); + + // 03E2 + + table += "\n"; + table += "\n"; + + String shape = splits[2].trim(); + if (shape.equals("all")) shape = ""; + + table += "\n"; + + // http://www.unicode.org/cgi-bin/varglyph?24-1820-180B-fina + // http://www.unicode.org/cgi-bin/varglyph?24-222A-FE00 + + table += "\n"; + + table += "\n"; + table += ""; + } + in.close(); + table += "
Rep GlyphCharacter SequenceContextAlt GlyphDescription of variant appearance
U+" + codes[0] + "" + splits[0] + "" + Utility.replace(shape, " ", "
") + "
"; + if (shape.length() == 0) { + table += showVarGlyphs(codes[0], codes[1], ""); + } else { + int shapeCount = Utility.split(shape, ' ', shapes); + for (int i = 0; i < shapeCount; ++i) { + if (i != 0) table += " "; + table += showVarGlyphs(codes[0], codes[1], shapes[i]); + } + } + table += "" + Default.ucd.getName(code) + " " + splits[1] + "
"; + + // now write out the results + + String directory = "DerivedData/"; + String filename = directory + "StandardizedVariants.html"; + PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX); + String[] batName = {""}; + String mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName); + + String[] replacementList = { + "@revision@", Default.ucd.getVersion(), + "@date@", Default.getDate(), + "@table@", table}; + + Utility.appendFile("StandardizedVariants-Template.html", Utility.UTF8, out, replacementList); + + out.close(); + Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]); + } +} diff --git a/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html new file mode 100644 index 00000000000..85c3e5bccf7 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html @@ -0,0 +1,137 @@ + + + + + + + + + + + +Standardized Variants + + + + + + + + + + + + +
[Unicode]  Unicode + Character Database
 
+
+

Standardized Variants

+ + + + + + + + + + + + + + + + + + + + + + + + + + + +
Revision@revision@
AuthorsMembers of the Editorial Committee
Date@date@
This Versionhttp://www.unicode.org/Public/3.2-Update/StandardizedVariants-@revision@.html
Previous Versionhttp://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html
Latest Versionhttp://www.unicode.org/Public/UNIDATA/StandardizedVariants.html
+


+ Summary

+
+

This file provides a visual display of the standardized variant sequences + derived from StandardizedVariants.txt.

+
+

Status

+
+

This file and the files described herein are part of the Unicode + Character Database (UCD) and are governed by the UCD + Terms of Use stated at the end.

+
+
+

Introduction

+

The tables here exhaustively list the valid, registered + combinations of base character plus variation indicator. All combinations not + listed in StandardizedVariants.txt are unspecified and are reserved for future + standardization; no conformant process may interpret them as standardized + variants. Variation selectors and their use are described in The Unicode + Standard.

+

The mathematical variants are all produced with the addition of Variation + Selector 1 (VS1 or U+FE00) to mathematical operator base characters. There is + no variation according to context. The Mongolian variants use the Mongolian + Free Variation Selectors, and may vary according to context. That is, if a contextual + shape is not listed below, then the variation sequence has an unmodified + appearance in that context. At this time no Han variants exist.

+
+

Note: The glyphs used to show the variations + are often derived from different physical fonts than the representative + glyphs in the standard. They may therefore exhibit minor differences in + size, proportion, or weight unrelated to the intentional difference + in feature that is the defining element of the variation. Such minor + differences should be ignored. Likewise, in some cases the existing + representative fonts may not yet contain newly encoded characters and hence + some representative glyphs shown in these tables may have a slightly + different style than others.

+
+

@table@

+
+

UCD Terms of Use

+

Disclaimer

+
+

The Unicode Character Database is provided as is by Unicode, Inc. No + claims are made as to fitness for any particular purpose. No warranties of + any kind are expressed or implied. The recipient agrees to determine + applicability of information provided. If this file has been purchased on + magnetic or optical media from Unicode, Inc., the sole remedy for any claim + will be exchange of defective media within 90 days of receipt.

+

This disclaimer is applicable for all other data files accompanying + the Unicode Character Database, some of which have been compiled by the + Unicode Consortium, and some of which have been supplied by other sources.

+
+

Limitations on Rights to Redistribute This Data

+
+

Recipient is granted the right to make copies in any form for internal + distribution and to freely use the information supplied in the creation of + products supporting the UnicodeTM Standard. The files in the + Unicode Character Database can be redistributed to third parties or other + organizations (whether for profit or not) as long as this notice and the + disclaimer notice are retained. Information can be extracted from these + files and used in documentation or programs, as long as there is an + accompanying notice indicating the source.

+
+
+
+
+ + + + +
Access to Copyright and terms of use
+ +
+
+
+ + + + diff --git a/tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java b/tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java new file mode 100644 index 00000000000..155ba809cf5 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java @@ -0,0 +1,109 @@ +/** +******************************************************************************* +* Copyright (C) 1996-2001, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +* +* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java,v $ +* $Date: 2003/02/26 00:35:09 $ +* $Revision: 1.1 $ +* +******************************************************************************* +*/ + +package com.ibm.text.UCD; + +import java.util.*; +import java.io.*; +import java.text.DateFormat; +import java.text.SimpleDateFormat; + +import com.ibm.text.utility.*; +import com.ibm.icu.text.UnicodeSet; + +public class TestNameUniqueness implements UCD_Types { + + public static void test() throws IOException { + Default.setUCD(); + new TestNameUniqueness().checkNames(); + } + + Map names = new HashMap(); + int[] charCount = new int[128]; + int[] samples = new int[128]; + + void checkNames() throws IOException { + PrintWriter out = Utility.openPrintWriter("name_uniqueness.txt", Utility.LATIN1_WINDOWS); + try { + out.println("Collisions"); + out.println(); + for (int cp = 0; cp < 0x10FFFF; ++cp) { + Utility.dot(cp); + if (!Default.ucd.isAllocated(cp)) continue; + if (Default.ucd.hasComputableName(cp)) continue; + int cat = Default.ucd.getCategory(cp); + if (cat == Cc) continue; + + String name = Default.ucd.getName(cp); + String processedName = processName(cp, name); + Integer existing = (Integer) names.get(processedName); + if (existing != null) { + out.println("Collision between: " + + Default.ucd.getCodeAndName(existing.intValue()) + + ", " + Default.ucd.getCodeAndName(cp)); 
+ } else { + names.put(processedName, new Integer(cp)); + } + } + out.println(); + out.println("Samples"); + out.println(); + for (int i = 0; i < charCount.length; ++i) { + int count = charCount[i]; + if (count == 0) continue; + String sampleName = Default.ucd.getCodeAndName(samples[i]); + out.println(count + "\t'" + ((char)i) + + "'\t" + Default.ucd.getCodeAndName(samples[i]) + + "\t=>\t" + processName(samples[i], Default.ucd.getName(samples[i]))); + } + out.println(); + out.println("Name Samples"); + out.println(); + for (int i = 0; i < 256; ++i) { + int cat = Default.ucd.getCategory(i); + if (cat == Cc) continue; + out.println(Default.ucd.getCodeAndName(i) + + "\t=>\t" + processName(i, Default.ucd.getName(i))); + } + } finally { + out.close(); + } + } + + static final String[][] replacements = { + //{"SMALL LETTER", ""}, + {"LETTER", ""}, + {"CHARACTER", ""}, + {"DIGIT", ""}, + {"SIGN", ""}, + //{"WITH", ""}, + }; + + StringBuffer processNamesBuffer = new StringBuffer(); + + String processName(int codePoint, String name) { + name = Utility.replace(name, replacements); + processNamesBuffer.setLength(0); + for (int i = 0; i < name.length(); ++i) { + char c = name.charAt(i); + ++charCount[c]; + if (samples[c] == 0) samples[c] = codePoint; + if ('A' <= c && c <= 'Z' + || '0' <= c && c <= '9') processNamesBuffer.append(c); + + } + if (processNamesBuffer.length() == name.length()) return name; + return processNamesBuffer.toString(); + } +} +