updated for 4.0

X-SVN-Rev: 11164
This commit is contained in:
Mark Davis 2003-02-26 00:35:09 +00:00
parent 76aa91b7db
commit 9c6b10e2dc
3 changed files with 359 additions and 0 deletions

View file

@ -0,0 +1,113 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateStandardizedVariants.java,v $
* $Date: 2003/02/26 00:35:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import java.util.*;
import java.io.*;
/**
 * Generates DerivedData/StandardizedVariants.html: an HTML table with one row
 * per line of the StandardizedVariants data file, substituted into
 * StandardizedVariants-Template.html together with the UCD version and date.
 */
public final class GenerateStandardizedVariants implements UCD_Types {

    /** Number of leading characters of a shaping-context name used in the glyph URL (e.g. "fina"). */
    private static final int SHAPE_ABBREVIATION_LENGTH = 4;

    /**
     * Builds the &lt;img&gt; element that displays one variant glyph.
     * Also echoes its arguments to System.out as a progress trace.
     *
     * @param code0 hex code of the base character, as written in the data file
     * @param code1 hex code of the variation selector, as written in the data file
     * @param shape shaping context (e.g. "final"), or "" when the variant
     *              does not depend on context
     * @return HTML of the form
     *         &lt;img alt='U+code0+U+code1/shape' src='...varglyph?24-code0-code1[-abbr]'&gt;
     */
    static public String showVarGlyphs(String code0, String code1, String shape) {
        System.out.println(code0 + ", " + code1 + ", [" + shape + "]");
        String abbShape = "";
        if (shape.length() != 0) {
            // The URL carries only the first four characters of the context name.
            // Clamp the end index so a token shorter than that cannot throw
            // StringIndexOutOfBoundsException.
            int end = Math.min(shape.length(), SHAPE_ABBREVIATION_LENGTH);
            abbShape = '-' + shape.substring(0, end);
            if (shape.endsWith("-feminine")) abbShape += "fem";
        }
        return "<img alt='U+" + code0 + "+U+" + code1 + "/" + shape
            + "' src='http://www.unicode.org/cgi-bin/varglyph?24-" +code0 + "-" + code1 + abbShape + "'>";
    }

    /*
    # Field 0: the variation sequence
    # Field 1: the description of the desired appearance
    # Field 2: where the appearance is only different in particular shaping environments
    #   this field lists them. The possible values are: isolated, initial, medial, final.
    #   If more than one is present, there are spaces between them.
    */

    /**
     * Reads the StandardizedVariants data file, composes the HTML table and
     * writes DerivedData/StandardizedVariants.html from the template.
     *
     * @throws IOException if reading the data file or writing the output fails
     */
    static public void generate() throws IOException {
        Default.setUCD();

        // read the data and compose the table
        String table = buildTable();

        // now write out the results
        String directory = "DerivedData/";
        String filename = directory + "StandardizedVariants.html";
        PrintWriter out = Utility.openPrintWriter(filename, Utility.LATIN1_UNIX);
        String[] batName = {""};
        String mostRecent;
        try {
            mostRecent = GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true), batName);

            String[] replacementList = {
                "@revision@", Default.ucd.getVersion(),
                "@date@", Default.getDate(),
                "@table@", table};

            Utility.appendFile("StandardizedVariants-Template.html", Utility.UTF8, out, replacementList);
        } finally {
            // Release the output file even when generation fails part-way.
            out.close();
        }
        Utility.renameIdentical(mostRecent, Utility.getOutputName(filename), batName[0]);
    }

    /**
     * Reads every data line of the StandardizedVariants file and renders it as
     * one table row: reference glyph, character sequence, shaping contexts,
     * variant glyph(s), and character name plus description.
     *
     * @return the complete &lt;table&gt; element
     * @throws IOException if the data file cannot be read
     */
    private static String buildTable() throws IOException {
        // StringBuffer avoids re-copying the whole table for every appended cell.
        StringBuffer table = new StringBuffer(
            "<table><tr><th>Rep Glyph</th><th>Character Sequence</th><th>Context</th><th width='10%'>Alt Glyph</th><th>Description of variant appearance</th></tr>");

        String[] splits = new String[4];
        String[] codes = new String[2];
        String[] shapes = new String[4];

        BufferedReader in = Utility.openUnicodeFile("StandardizedVariants", Default.ucdVersion, true, Utility.LATIN1);
        try {
            while (true) {
                String line = Utility.readDataLine(in);
                if (line == null) break;            // end of input
                if (line.length() == 0) continue;   // nothing to render on this line

                // Fill the field arrays; the returned counts are not needed here.
                Utility.split(line, ';', splits);
                Utility.split(splits[0], ' ', codes);
                int code = Utility.codePointFromHex(codes[0]);

                // <img alt="03E2" src="http://www.unicode.org/cgi-bin/refglyph?24-03E2" style="vertical-align:middle">
                table.append("<tr><td><img alt='U+" + codes[0] + "' src='http://www.unicode.org/cgi-bin/refglyph?24-" + codes[0] + "'></td>\n");
                table.append("<td>" + splits[0] + "</td>\n");

                // "all" is rendered the same way as an empty context list.
                String shape = splits[2].trim();
                if (shape.equals("all")) shape = "";

                table.append("<td>" + Utility.replace(shape, " ", "<br>") + "</td>\n");

                // http://www.unicode.org/cgi-bin/varglyph?24-1820-180B-fina
                // http://www.unicode.org/cgi-bin/varglyph?24-222A-FE00
                table.append("<td>");
                if (shape.length() == 0) {
                    table.append(showVarGlyphs(codes[0], codes[1], ""));
                } else {
                    // One glyph image per listed shaping context, separated by spaces.
                    int shapeCount = Utility.split(shape, ' ', shapes);
                    for (int i = 0; i < shapeCount; ++i) {
                        if (i != 0) table.append(" ");
                        table.append(showVarGlyphs(codes[0], codes[1], shapes[i]));
                    }
                }
                table.append("</td>\n");

                table.append("<td>" + Default.ucd.getName(code) + " " + splits[1] + "</td>\n");
                table.append("</tr>");
            }
        } finally {
            // Close the data file even if a malformed line aborts the loop.
            in.close();
        }
        table.append("</table>");
        return table.toString();
    }
}

View file

@ -0,0 +1,137 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="ProgId" content="FrontPage.Editor.Document">
<meta name="keywords" content="unicode, variant glyphs">
<meta name="description" content="Describes and displays standardized variant glyphs">
<title>Standardized Variants</title>
<link rel="stylesheet" type="text/css" href="http://www.unicode.org/reports/reports.css">
</head>
<body bgcolor="#ffffff">
<table class="header">
<tr>
<td class="icon"><a href="http://www.unicode.org"><img align="middle" alt="[Unicode]" border="0" src="http://www.unicode.org/webscripts/logo60s2.gif" width="34" height="33"></a>&nbsp;&nbsp;<a class="bar" href="UnicodeCharacterDatabase.html">Unicode
Character Database</a></td>
</tr>
<tr>
<td class="gray">&nbsp;</td>
</tr>
</table>
<blockquote>
<h1>Standardized Variants</h1>
<table class="wide">
<tbody>
<tr>
<td valign="top" width="144">Revision</td>
<td valign="top">@revision@</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
<td valign="top">Members of the Editorial Committee</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
<td valign="top">@date@</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/3.2-Update/StandardizedVariants-@revision@.html">http://www.unicode.org/Public/3.2-Update/StandardizedVariants-@revision@.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html">http://www.unicode.org/Public/3.2-Update/StandardizedVariants-3.2.0.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Latest Version</td>
<td valign="top"><a href="http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html">http://www.unicode.org/Public/UNIDATA/StandardizedVariants.html</a></td>
</tr>
</tbody>
</table>
<h3><br>
<i>Summary</i></h3>
<blockquote>
<p>This file provides a visual display of the standard variant sequences
derived from StandardizedVariants.txt.</p>
</blockquote>
<h3><i>Status</i></h3>
<blockquote>
<p><i>This file and the files described herein are part of the <a href="http://www.unicode.org/ucd">Unicode
Character Database</a> (UCD) and are governed by the <a href="#Terms of Use">UCD
Terms of Use</a> stated at the end.</i></p>
</blockquote>
<hr width="50%">
<h2>Introduction</h2>
<p>The table here <i>exhaustively</i> lists the valid, registered
combinations of base character plus variation indicator. All combinations not
listed in StandardizedVariants.txt are unspecified and are reserved for future
standardization; no conformant process may interpret them as standardized
variants. Variation selectors and their use are described in The Unicode
Standard.</p>
<p>These mathematical variants are all produced with the addition of Variation
Selector 1 (VS1 or U+FE00) to mathematical operator base characters. There is
no variation according to context. The Mongolian variants use the Mongolian
Variant Selectors, and may vary according to context. That is, if a contextual
shape is not listed below, then the variation sequence has an unmodified
appearance. At this time no Han variants exist.</p>
<blockquote>
<p><a name="fonts"><b>Note: </b></a>The glyphs used to show the variations
are often derived from different physical fonts than the representative
glyphs in the standard. They may therefore exhibit minor differences in
size, proportion, or weight <i>unrelated</i> to the intentional difference
in feature that is the defining element of the variation. Such minor
differences should be ignored. Likewise, in some cases the existing
representative fonts may not yet contain newly encoded characters and hence
some representative glyphs shown in these tables may have a slightly
different style than others.</p>
</blockquote>
<p>@table@</p>
<hr width="50%">
<h2>UCD <a name="Terms of Use">Terms of Use</a></h2>
<h3><i>Disclaimer</i></h3>
<blockquote>
<p><i>The Unicode Character Database is provided as is by Unicode, Inc. No
claims are made as to fitness for any particular purpose. No warranties of
any kind are expressed or implied. The recipient agrees to determine
applicability of information provided. If this file has been purchased on
magnetic or optical media from Unicode, Inc., the sole remedy for any claim
will be exchange of defective media within 90 days of receipt.</i></p>
<p><i>This disclaimer is applicable for all other data files accompanying
the Unicode Character Database, some of which have been compiled by the
Unicode Consortium, and some of which have been supplied by other sources.</i></p>
</blockquote>
<h3><i>Limitations on Rights to Redistribute This Data</i></h3>
<blockquote>
<p><i>Recipient is granted the right to make copies in any form for internal
distribution and to freely use the information supplied in the creation of
products supporting the Unicode<sup>TM</sup> Standard. The files in the
Unicode Character Database can be redistributed to third parties or other
organizations (whether for profit or not) as long as this notice and the
disclaimer notice are retained. Information can be extracted from these
files and used in documentation or programs, as long as there is an
accompanying notice indicating the source.</i></p>
</blockquote>
<hr width="50%">
<div align="center">
<center>
<table cellspacing="0" cellpadding="0" border="0">
<tr>
<td><a href="http://www.unicode.org/unicode/copyright.html"><img src="http://www.unicode.org/img/hb_notice.gif" border="0" alt="Access to Copyright and terms of use" width="216" height="50"></a></td>
</tr>
</table>
<script language="Javascript" type="text/javascript" src="http://www.unicode.org/webscripts/lastModified.js"></script>
</center>
</div>
</blockquote>
</body>
</html>

View file

@ -0,0 +1,109 @@
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNameUniqueness.java,v $
* $Date: 2003/02/26 00:35:09 $
* $Revision: 1.1 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UnicodeSet;
/**
 * Checks whether character names stay unique after the filler words in
 * {@link #replacements} are removed and everything except A-Z and 0-9 is
 * dropped. Writes a report to name_uniqueness.txt with three sections:
 * collisions, per-character statistics with a sample, and the reduced names
 * of the first 256 code points.
 */
public class TestNameUniqueness implements UCD_Types {

    /** Highest Unicode code point. */
    private static final int MAX_CODE_POINT = 0x10FFFF;

    /**
     * Entry point: selects the default UCD and runs the check.
     *
     * @throws IOException if the report file cannot be written
     */
    public static void test() throws IOException {
        Default.setUCD();
        new TestNameUniqueness().checkNames();
    }

    /** Maps a reduced name (String) to the first code point (Integer) that produced it. */
    Map names = new HashMap();

    /** Occurrence count per character (indexed by char value below 128) seen in scanned names. */
    int[] charCount = new int[128];

    /**
     * First code point whose name contained the character at that index.
     * 0 means "none yet"; U+0000 is never recorded because Cc characters are skipped.
     */
    int[] samples = new int[128];

    /**
     * Scans the code points, reports reduced-name collisions and writes the
     * statistics sections of the report.
     *
     * @throws IOException if the report file cannot be opened
     */
    void checkNames() throws IOException {
        PrintWriter out = Utility.openPrintWriter("name_uniqueness.txt", Utility.LATIN1_WINDOWS);
        try {
            out.println("Collisions");
            out.println();
            // Inclusive upper bound: the original "< 0x10FFFF" skipped the last code point.
            for (int cp = 0; cp <= MAX_CODE_POINT; ++cp) {
                Utility.dot(cp);
                if (!Default.ucd.isAllocated(cp)) continue;
                if (Default.ucd.hasComputableName(cp)) continue;
                int cat = Default.ucd.getCategory(cp);
                if (cat == Cc) continue;

                String name = Default.ucd.getName(cp);
                String processedName = processName(cp, name);
                Integer existing = (Integer) names.get(processedName);
                if (existing != null) {
                    out.println("Collision between: "
                        + Default.ucd.getCodeAndName(existing.intValue())
                        + ", " + Default.ucd.getCodeAndName(cp));
                } else {
                    names.put(processedName, new Integer(cp));
                }
            }

            out.println();
            out.println("Samples");
            out.println();
            for (int i = 0; i < charCount.length; ++i) {
                int count = charCount[i];
                if (count == 0) continue;
                String sampleName = Default.ucd.getCodeAndName(samples[i]);
                // Use the side-effect-free reduction here: calling processName
                // would bump charCount/samples while they are being reported,
                // inflating the counts printed for later characters.
                out.println(count + "\t'" + ((char)i)
                    + "'\t" + sampleName
                    + "\t=>\t" + reduceName(Default.ucd.getName(samples[i])));
            }

            out.println();
            out.println("Name Samples");
            out.println();
            for (int i = 0; i < 256; ++i) {
                int cat = Default.ucd.getCategory(i);
                if (cat == Cc) continue;
                out.println(Default.ucd.getCodeAndName(i)
                    + "\t=>\t" + reduceName(Default.ucd.getName(i)));
            }
        } finally {
            out.close();
        }
    }

    /** Words removed from a name before it is reduced; each pair is {search, replacement}. */
    static final String[][] replacements = {
        //{"SMALL LETTER", ""},
        {"LETTER", ""},
        {"CHARACTER", ""},
        {"DIGIT", ""},
        {"SIGN", ""},
        //{"WITH", ""},
    };

    /** Scratch buffer reused across calls to avoid reallocating per name. */
    StringBuffer processNamesBuffer = new StringBuffer();

    /**
     * Reduces a name for comparison and records character statistics for it.
     * Every character of the name (after the {@link #replacements} are
     * applied) is counted in {@link #charCount}, and {@code codePoint} is
     * remembered in {@link #samples} as the sample for characters not seen before.
     *
     * @param codePoint the code point the name belongs to
     * @param name the character name
     * @return the name with replacements applied and only A-Z / 0-9 kept
     */
    String processName(int codePoint, String name) {
        String replaced = Utility.replace(name, replacements);
        for (int i = 0; i < replaced.length(); ++i) {
            char c = replaced.charAt(i);
            // The statistics arrays only cover chars below their length;
            // skip anything larger instead of failing with an index error.
            if (c >= charCount.length) continue;
            ++charCount[c];
            if (samples[c] == 0) samples[c] = codePoint;
        }
        return keepLettersAndDigits(replaced);
    }

    /**
     * Reduces a name exactly like {@link #processName} but without touching
     * the statistics, so it is safe to call while reporting them.
     */
    private String reduceName(String name) {
        return keepLettersAndDigits(Utility.replace(name, replacements));
    }

    /**
     * Drops every character other than A-Z and 0-9.
     * Returns the argument itself when nothing had to be dropped.
     */
    private String keepLettersAndDigits(String name) {
        processNamesBuffer.setLength(0);
        for (int i = 0; i < name.length(); ++i) {
            char c = name.charAt(i);
            if ('A' <= c && c <= 'Z'
                || '0' <= c && c <= '9') processNamesBuffer.append(c);
        }
        if (processNamesBuffer.length() == name.length()) return name;
        return processNamesBuffer.toString();
    }
}