From 9c6b10e2dc1a77df216c05f52b9ac87df7b36a22 Mon Sep 17 00:00:00 2001
From: Mark Davis <mark@macchiato.com>
Date: Wed, 26 Feb 2003 00:35:09 +0000
Subject: [PATCH] updated for 4.0

X-SVN-Rev: 11164
---
 .../UCD/GenerateStandardizedVariants.java     | 113 +++++++++++++++
 .../UCD/StandardizedVariants-Template.html    | 137 ++++++++++++++++++
 .../com/ibm/text/UCD/TestNameUniqueness.java  | 109 ++++++++++++++
 3 files changed, 359 insertions(+)
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html
 create mode 100644 tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java
diff --git a/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java b/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java
new file mode 100644
index 00000000000..37a2a5abf5f
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java
@@ -0,0 +1,158 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $
+* $Date: 2003/02/26 00:35:09 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+import com.ibm.text.utility.*;
+import com.ibm.icu.text.UTF16;
+import com.ibm.icu.text.UnicodeSet;
+import java.util.*;
+import java.io.*;
+
+public final class GenerateStandardizedVariants implements UCD_Types {
+
+    /** Number of leading characters of a shaping-context name used in a varglyph URL. */
+    private static final int SHAPE_ABBREVIATION_LENGTH = 4;
+
+    /**
+     * Builds the HTML img element that shows the glyph of one variation sequence
+     * in one shaping context. The arguments are also echoed to System.out.
+     *
+     * @param code0 hex code of the first character of the sequence, e.g. "1820"
+     * @param code1 hex code of the second character of the sequence, e.g. "180B"
+     * @param shape shaping context (e.g. "final"), or "" when no context applies
+     *              and the URL carries no shape suffix
+     * @return the img element, whose src is a varglyph query of the form
+     *         24-code0-code1[-abbreviatedShape]
+     */
+    static public String showVarGlyphs(String code0, String code1, String shape) {
+        System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
+
+        String abbShape = "";
+        if (shape.length() != 0) {
+            // Clamp the abbreviation length so that a context name shorter than
+            // four characters cannot throw StringIndexOutOfBoundsException.
+            int end = Math.min(shape.length(), SHAPE_ABBREVIATION_LENGTH);
+            abbShape = '-' + shape.substring(0, end);
+            if (shape.endsWith("-feminine")) abbShape += "fem";
+        }
+
+        return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
+            + "' src='http://www.unicode.org/cgi-bin/varglyph?24-" + code0 + "-" + code1 + abbShape + "'>";
+    }
+
+    /**
+     * Reads the StandardizedVariants data file and renders one HTML table row per data line.
+     * <p>
+     * Each data line has three semicolon-separated fields:
+     * <ul>
+     * <li>Field 0: the variation sequence</li>
+     * <li>Field 1: the description of the desired appearance</li>
+     * <li>Field 2: where the appearance is only different in particular shaping
+     *     environments, this field lists them. The possible values are: isolated,
+     *     initial, medial, final. If more than one is present, there are spaces
+     *     between them.</li>
+     * </ul>
+     *
+     * @return the complete table markup
+     * @throws IOException if the data file cannot be read
+     */
+    private static String buildTable() throws IOException {
+        // A StringBuffer avoids re-copying the whole table for every appended cell.
+        StringBuffer table = new StringBuffer(
+            "<table><tr><th>Rep Glyph</th><th>Character Sequence</th><th>Context</th><th width='10%'>Alt Glyph</th><th>Description of variant appearance</th></tr>");
+
+        String[] splits = new String[4];
+        String[] codes = new String[2];
+        String[] shapes = new String[4];
+
+        BufferedReader in = Utility.openUnicodeFile("StandardizedVariants", Default.ucdVersion, true, Utility.LATIN1);
+        try {
+            while (true) {
+                String line = Utility.readDataLine(in);
+                if (line == null) break;
+                if (line.length() == 0) continue;
+
+                Utility.split(line, ';', splits);
+                Utility.split(splits[0], ' ', codes);
+                int code = Utility.codePointFromHex(codes[0]);
+
+                // <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
+
+                table.append("<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n");
+                table.append("<td>" + splits[0] + "</td>\n");
+
+                String shape = splits[2].trim();
+                if (shape.equals("all")) shape = "";
+
+                table.append("<td>" + Utility.replace(shape, " ", "<br>") + "</td>\n");
+
+                // http://www.unicode.org/cgi-bin/varglyph?24-1820-180B-fina
+                // http://www.unicode.org/cgi-bin/varglyph?24-222A-FE00
+
+                table.append("<td>");
+                if (shape.length() == 0) {
+                    table.append(showVarGlyphs(codes[0], codes[1], ""));
+                } else {
+                    int shapeCount = Utility.split(shape, ' ', shapes);
+                    for (int i = 0; i < shapeCount; ++i) {
+                        if (i != 0) table.append(" ");
+                        table.append(showVarGlyphs(codes[0], codes[1], shapes[i]));
+                    }
+                }
+                table.append("</td>\n");
+
+                table.append("<td>" + Default.ucd.getName(code) + " " + splits[1] + "</td>\n");
+                table.append("</tr>");
+            }
+        } finally {
+            // Release the reader even when a malformed line aborts the loop.
+            in.close();
+        }
+        table.append("</table>");
+        return table.toString();
+    }
+
+    /**
+     * Generates DerivedData/StandardizedVariants.html by substituting the revision,
+     * the date and the variant table into StandardizedVariants-Template.html.
+     *
+     * @throws IOException if the data file cannot be read or the output cannot be written
+     */
+    static public void generate() throws IOException {
+        Default.setUCD();
+
+        // read the data and compose the table
+        String table = buildTable();
+
+        // now write out the results
+        String directory = "DerivedData/";
+        String filename = directory + "StandardizedVariants.html";
+        String[] batName = {""};
+        String mostRecent;
+        PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
+        try {
+            mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName);
+
+            String[] replacementList = {
+                "@revision@", Default.ucd.getVersion(),
+                "@date@", Default.getDate(),
+                "@table@", table};
+
+            Utility.appendFile("StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
+        } finally {
+            // Close the writer on every path so the file handle is not leaked.
+            out.close();
+        }
+        Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
+    }
+}
diff --git a/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html
new file mode 100644
index 00000000000..85c3e5bccf7
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/StandardizedVariants-Template.html
@@ -0,0 +1,137 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
+
+       "http://www.w3.org/TR/REC-html40/loose.dtd"> 
+
+<html>
+
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+<meta http-equiv="Content-Language" content="en-us">
+<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
+<meta name="ProgId" content="FrontPage.Editor.Document">
+<meta name="keywords" content="unicode, variant glyphs">
+<meta name="description" content="Describes and displays standardized variant glyphs">
+<title>Standardized Variants</title>
+<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
+</head>
+
+<body bgcolor="#ffffff">
+
+<table class="header">
+  <tr>
+    <td class="icon"><a href="http://www.unicode.org"><img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="UnicodeCharacterDatabase.html">Unicode 
+      Character Database</a></td>
+  </tr>
+  <tr>
+    <td class="gray">&nbsp;</td>
+  </tr>
+</table>
+<blockquote>
+  <h1>Standardized Variants</h1>
+  <table class="wide">
+    <tbody>
+      <tr>
+        <td valign="top" width="144">Revision</td>
+        <td valign="top">@revision@</td>
+      </tr>
+      <tr>
+        <td valign="top" width="144">Authors</td>
+        <td valign="top">Members of the Editorial Committee</td>
+      </tr>
+      <tr>
+        <td valign="top" width="144">Date</td>
+        <td valign="top">@date@</td>
+      </tr>
+      <tr>
+        <td valign="top" width="144">This Version</td>
+        <td valign="top"><a href="http://www.unicode.org/Public/3.2-Update/StandardizedVariants-@revision@.html">http://www.unicode.org/Public/3.2-Update/StandardizedVariants-@revision@.html</a></td>
+      </tr>
+      <tr>
+        <td valign="top" width="144">Previous Version</td>
+        <td valign="top"><a href="http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html">http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html</a></td>
+      </tr>
+      <tr>
+        <td valign="top" width="144">Latest Version</td>
+        <td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
+      </tr>
+    </tbody>
+  </table>
+  <h3><br>
+  <i>Summary</i></h3>
+  <blockquote>
+    <p>This file provides a visual display of the standard variant sequences 
+    derived from StandardizedVariants.txt.</p>
+  </blockquote>
+  <h3><i>Status</i></h3>
+  <blockquote>
+    <p><i>This file and the files described herein are part of the <a href="http://www.unicode.org/ucd">Unicode 
+    Character Database</a> (UCD) and are governed by the <a href="#Terms of Use">UCD 
+    Terms of Use</a> stated at the end.</i></p>
+  </blockquote>
+  <hr width="50%">
+  <h2>Introduction</h2>
+  <p>The table here <i>exhaustively</i> lists the valid, registered 
+  combinations of base character plus variation indicator. All combinations not 
+  listed in StandardizedVariants.txt are unspecified and are reserved for future 
+  standardization; no conformant process may interpret them as standardized 
+  variants. Variation selectors and their use are described in The Unicode 
+  Standard.</p>
+  <p>These mathematical variants are all produced with the addition of Variation 
+  Selector 1 (VS1 or U+FE00) to mathematical operator base characters. There is 
+  no variation according to context. The Mongolian variants use the Mongolian 
+  Variant Selectors, and may vary according to context. That is, if a contextual 
+  shape is not listed below, then the variation sequence has an unmodified 
+  appearance. At this time no Han variants exist.</p>
+  <blockquote>
+    <p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations 
+    are often derived from different physical fonts than the representative 
+    glyphs in the standard. They may therefore exhibit minor differences in 
+    size, proportion, or weight <i>unrelated</i> to the intentional difference 
+    in feature that is the defining element of the variation. Such minor 
+    differences should be ignored. Likewise, in some cases the existing 
+    representative fonts may not yet contain newly encoded characters and hence 
+    some representative glyphs shown in these tables may have a slightly 
+    different style than others.</p>
+  </blockquote>
+  <p>@table@</p>
+  <hr width="50%">
+  <h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
+  <h3><i>Disclaimer</i></h3>
+  <blockquote>
+    <p><i>The Unicode Character Database is provided as is by Unicode, Inc. No 
+    claims are made as to fitness for any particular purpose. No warranties of 
+    any kind are expressed or implied. The recipient agrees to determine 
+    applicability of information provided. If this file has been purchased on 
+    magnetic or optical media from Unicode, Inc., the sole remedy for any claim 
+    will be exchange of defective media within 90 days of receipt.</i></p>
+    <p><i>This disclaimer is applicable for all other data files accompanying 
+    the Unicode Character Database, some of which have been compiled by the 
+    Unicode Consortium, and some of which have been supplied by other sources.</i></p>
+  </blockquote>
+  <h3><i>Limitations on Rights to Redistribute This Data</i></h3>
+  <blockquote>
+    <p><i>Recipient is granted the right to make copies in any form for internal 
+    distribution and to freely use the information supplied in the creation of 
+    products supporting the Unicode<sup>TM</sup> Standard. The files in the 
+    Unicode Character Database can be redistributed to third parties or other 
+    organizations (whether for profit or not) as long as this notice and the 
+    disclaimer notice are retained. Information can be extracted from these 
+    files and used in documentation or programs, as long as there is an 
+    accompanying notice indicating the source.</i></p>
+  </blockquote>
+  <hr width="50%">
+  <div align="center">
+    <center>
+    <table cellspacing="0" cellpadding="0" border="0">
+      <tr>
+        <td><a href="http://www.unicode.org/unicode/copyright.html"><img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
+      </tr>
+    </table>
+    <script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
+    </center>
+  </div>
+</blockquote>
+
+</body>
+
+</html>
diff --git a/tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java b/tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java
new file mode 100644
index 00000000000..155ba809cf5
--- /dev/null
+++ b/tools/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java
@@ -0,0 +1,166 @@
+/**
+*******************************************************************************
+* Copyright (C) 1996-2001, International Business Machines Corporation and    *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java,v $
+* $Date: 2003/02/26 00:35:09 $
+* $Revision: 1.1 $
+*
+*******************************************************************************
+*/
+
+package com.ibm.text.UCD;
+
+import java.util.*;
+import java.io.*;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
+
+import com.ibm.text.utility.*;
+import com.ibm.icu.text.UnicodeSet;
+
+public class TestNameUniqueness implements UCD_Types {
+
+    /** Highest Unicode code point; the collision scan includes it. */
+    private static final int MAX_CODE_POINT = 0x10FFFF;
+
+    /**
+     * Sets up the default UCD and writes the name-uniqueness report.
+     *
+     * @throws IOException if the report file cannot be written
+     */
+    public static void test() throws IOException {
+        Default.setUCD();
+        new TestNameUniqueness().checkNames();
+    }
+
+    /** Maps each processed name to the first code point (an Integer) that produced it. */
+    Map names = new HashMap();
+    /** Tally of the characters seen in names after replacement, indexed by char value. */
+    int[] charCount = new int[128];
+    /** First code point whose name contained the indexing character; 0 means none yet. */
+    int[] samples = new int[128];
+
+    /**
+     * Writes name_uniqueness.txt in three sections: pairs of characters whose
+     * processed names collide, a per-character tally with one sample name each,
+     * and the processed names of the non-control characters below U+0100.
+     *
+     * @throws IOException if the report file cannot be written
+     */
+    void checkNames() throws IOException {
+        PrintWriter out = Utility.openPrintWriter("name_uniqueness.txt", Utility.LATIN1_WINDOWS);
+        try {
+            out.println("Collisions");
+            out.println();
+            // The bound is inclusive so that U+10FFFF itself is examined.
+            for (int cp = 0; cp <= MAX_CODE_POINT; ++cp) {
+                Utility.dot(cp);
+                if (!Default.ucd.isAllocated(cp)) continue;
+                if (Default.ucd.hasComputableName(cp)) continue;
+                int cat = Default.ucd.getCategory(cp);
+                if (cat == Cc) continue;
+
+                String name = Default.ucd.getName(cp);
+                String processedName = processName(cp, name);
+                Integer existing = (Integer) names.get(processedName);
+                if (existing != null) {
+                    out.println("Collision between: "
+                        + Default.ucd.getCodeAndName(existing.intValue())
+                        + ", " + Default.ucd.getCodeAndName(cp));
+                } else {
+                    names.put(processedName, new Integer(cp));
+                }
+            }
+            out.println();
+            out.println("Samples");
+            out.println();
+            // The report sections use stripName rather than processName so that
+            // printing a sample does not change the tallies still to be printed.
+            for (int i = 0; i < charCount.length; ++i) {
+                int count = charCount[i];
+                if (count == 0) continue;
+                String sampleName = Default.ucd.getCodeAndName(samples[i]);
+                out.println(count + "\t'" + ((char)i)
+                    + "'\t" + sampleName
+                    + "\t=>\t" + stripName(Default.ucd.getName(samples[i])));
+            }
+            out.println();
+            out.println("Name Samples");
+            out.println();
+            for (int i = 0; i < 256; ++i) {
+                int cat = Default.ucd.getCategory(i);
+                if (cat == Cc) continue;
+                out.println(Default.ucd.getCodeAndName(i)
+                    + "\t=>\t" + stripName(Default.ucd.getName(i)));
+            }
+        } finally {
+            out.close();
+        }
+    }
+
+    /** Pairs handed to Utility.replace; each maps a word to the empty string. */
+    static final String[][] replacements = {
+        //{"SMALL LETTER", ""},
+        {"LETTER", ""},
+        {"CHARACTER", ""},
+        {"DIGIT", ""},
+        {"SIGN", ""},
+        //{"WITH", ""},
+    };
+
+    /** Scratch buffer reused by keepLettersAndDigits. */
+    StringBuffer processNamesBuffer = new StringBuffer();
+
+    /**
+     * Reduces a name to its comparison key and tallies the characters of the
+     * replaced name in charCount and samples.
+     *
+     * @param codePoint code point the name belongs to; recorded as the sample
+     *                  for any character not tallied before
+     * @param name the character name
+     * @return the name after the replacements, reduced to A-Z and 0-9
+     */
+    String processName(int codePoint, String name) {
+        String replaced = Utility.replace(name, replacements);
+        for (int i = 0; i < replaced.length(); ++i) {
+            char c = replaced.charAt(i);
+            // Characters outside the tally arrays are skipped rather than
+            // failing with ArrayIndexOutOfBoundsException.
+            if (c >= charCount.length) continue;
+            ++charCount[c];
+            if (samples[c] == 0) samples[c] = codePoint;
+        }
+        return keepLettersAndDigits(replaced);
+    }
+
+    /**
+     * Computes the comparison key of a name without touching the tallies.
+     *
+     * @param name the character name
+     * @return the name after the replacements, reduced to A-Z and 0-9
+     */
+    private String stripName(String name) {
+        return keepLettersAndDigits(Utility.replace(name, replacements));
+    }
+
+    /**
+     * Drops every character other than A-Z and 0-9.
+     *
+     * @param text the text to filter
+     * @return text itself when nothing was dropped, otherwise the filtered copy
+     */
+    private String keepLettersAndDigits(String text) {
+        processNamesBuffer.setLength(0);
+        for (int i = 0; i < text.length(); ++i) {
+            char c = text.charAt(i);
+            if ('A' <= c && c <= 'Z'
+                || '0' <= c && c <= '9') processNamesBuffer.append(c);
+        }
+        if (processNamesBuffer.length() == text.length()) return text;
+        return processNamesBuffer.toString();
+    }
+}
+

Revision	@revision@
Authors	Members of the Editorial Committee
Date	@date@
This Version	http://www.unicode.org/Public/3.2-Update/StandardizedVariants-@revision@.html
Previous Version	http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html
Latest Version	http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html