Fixes for charts

X-SVN-Rev: 6470
This commit is contained in:
Mark Davis 2001-10-26 23:33:48 +00:00
parent f26268d1a5
commit d663f96a92
13 changed files with 353 additions and 127 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
* $Date: 2001/10/25 20:35:41 $
* $Revision: 1.5 $
* $Date: 2001/10/26 23:32:03 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -23,6 +23,7 @@ import java.io.IOException;
import com.ibm.text.UCD.Normalizer;
import com.ibm.text.UCD.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;
//import com.ibm.text.CollationData.*;
@ -116,7 +117,7 @@ final public class UCA implements Comparator {
* If the source is null, uses the normal Unicode data files, which
* need to be in BASE_DIR.
*/
public UCA(BufferedReader source) throws java.io.IOException {
public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
fullData = source == null;
// clear some tables
@ -125,9 +126,11 @@ final public class UCA implements Comparator {
}
// load the normalizer
if (toD == null) {
toD = new Normalizer(Normalizer.NFD);
toD = new Normalizer(Normalizer.NFD, unicodeVersion);
}
ucdVersion = UCD.make(unicodeVersion).getVersion();
// either get the full sources, or just a demo set
if (fullData) {
for (int i = 0; i < KEYS.length; ++i) {
@ -369,12 +372,19 @@ final public class UCA implements Comparator {
}
/**
* Retrieves version
* Retrieves versions
*/
public String getDataVersion() {
return dataVersion;
}
/**
* Retrieves the version of the Unicode Character Database (UCD) in use.
* The value is taken from UCD.make(unicodeVersion).getVersion() when the
* UCA is constructed; the field's initial value is "?".
* @return the recorded UCD version string
*/
public String getUCDVersion() {
return ucdVersion;
}
/**
* Convenience wrapper: runs appendInCodePointOrder on s with a fresh,
* empty StringBuffer and returns the resulting text.
* @param s the source string
* @return the string form of whatever appendInCodePointOrder produced
*/
public static String codePointOrder(String s) {
    StringBuffer buffer = new StringBuffer();
    return appendInCodePointOrder(s, buffer).toString();
}
@ -556,7 +566,7 @@ final public class UCA implements Comparator {
public static int makeKey(int primary, int secondary, int tertiary) {
    // Pack the three weights into a single int: primary from bit 16 upward,
    // secondary from bit 7 upward, tertiary in the low bits.
    // No masking is applied, so oversized weights overlap their neighbours.
    int packed = tertiary;
    packed |= secondary << 7;
    packed |= primary << 16;
    return packed;
}
// =============================================================
// Utility methods
// =============================================================
@ -571,11 +581,11 @@ final public class UCA implements Comparator {
result.append("[");
for (int i = 0; i < sortKey.length(); ++i) {
char ch = sortKey.charAt(i);
if (needSep) result.append(" ");
if (ch == 0) {
result.append("|");
needSep = false;
needSep = true;
} else {
if (needSep) result.append(" ");
result.append(hex(ch));
needSep = true;
}
@ -699,6 +709,11 @@ final public class UCA implements Comparator {
*/
private String dataVersion = "?";
/**
* Records the UCD version
*/
private String ucdVersion = "?";
/**
* Turns backwards (e.g. for French) on globally for all secondaries
*/
@ -783,8 +798,8 @@ final public class UCA implements Comparator {
* There are at least 34 values, so that we can use a range for surrogates
* However, we do add to the first weight if we have surrogate pairs!
*/
static final int UNSUPPORTED_P = 0xFFC2;
static final int UNSUPPORTED = makeKey(UNSUPPORTED_P, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
public static final int UNSUPPORTED_BASE = 0xFFC2;
static final int UNSUPPORTED = makeKey(UNSUPPORTED_BASE, NEUTRAL_SECONDARY, NEUTRAL_TERTIARY);
// was 0xFFC20101;
@ -977,7 +992,7 @@ final public class UCA implements Comparator {
//return UNSUPPORTED + (bigChar & 0xFFFF0000); // top bits added
expandingStack.push(makeKey((bigChar & 0x7FFF) | 0x8000, 0, 0)); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
return makeKey(UNSUPPORTED_P + (bigChar >> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED
return makeKey(UNSUPPORTED_BASE + (bigChar >>> 15), NEUTRAL_SECONDARY, NEUTRAL_TERTIARY); // top 34 values plus UNSUPPORTED
/*
expandingStack.push(((bigChar & 0x7FFF) << 16) | 0x10000000); // primary = bottom 15 bits plus turn bottom bit on.
// secondary and tertiary are both zero
@ -1148,6 +1163,11 @@ final public class UCA implements Comparator {
Normalizer skipDecomps = new Normalizer(Normalizer.NFD);
Iterator enum = null;
byte ceLimit;
int currentRange = Integer.MAX_VALUE; // set to ZERO to enable
int startOfRange = SAMPLE_RANGES[0][0];
int endOfRange = startOfRange;
int itemInRange = startOfRange;
int skip = 1;
/**
* use FIXED_CE as the limit
@ -1157,6 +1177,13 @@ final public class UCA implements Comparator {
this.skipDecomps = skipDecomps;
}
/**
* Turns on the extra samples. currentRange starts at Integer.MAX_VALUE
* (disabled); setting it to 0 makes it a valid index into SAMPLE_RANGES,
* so the code points listed there are returned in addition to the
* regular contents.
*/
public void enableSamples() {
currentRange = 0;
}
/**
* returns a string
*/
@ -1176,12 +1203,35 @@ final public class UCA implements Comparator {
if (enum == null) enum = multiTable.keySet().iterator();
if (enum.hasNext()) {
result = (String)enum.next();
return result;
}
// extra samples
if (currentRange < SAMPLE_RANGES.length) {
try {
result = UTF16.valueOf(itemInRange);
} catch (RuntimeException e) {
System.out.println(Utility.hex(itemInRange));
throw e;
}
++itemInRange;
if (itemInRange > endOfRange) {
++currentRange;
if (currentRange < SAMPLE_RANGES.length) {
startOfRange = itemInRange = SAMPLE_RANGES[currentRange][0];
endOfRange = SAMPLE_RANGES[currentRange].length > 1
? SAMPLE_RANGES[currentRange][1]
: startOfRange;
skip = ((endOfRange - startOfRange) / 513);
}
} else if (itemInRange > startOfRange + 9 && itemInRange < endOfRange - 9 - skip) {
itemInRange += skip;
}
}
return result;
}
/**
* returns a string and its ces
*/
@ -1208,6 +1258,30 @@ final public class UCA implements Comparator {
}
}
/**
* Extra code points handed out once enableSamples() has been called.
* Each entry is either {codePoint} (a single sample) or {start, end}
* (an inclusive range). For large ranges the iterator does not return every
* code point: apart from roughly the first and last ten items it steps
* by (end - start) / 513.
*/
static final int[][] SAMPLE_RANGES = {
// isolated code points (edge cases)
{0x10000},
{0x10FFFF}, // NOTE(review): also listed a few lines below, so it is sampled twice
{0x0220},
{0xFFF0},
{0xD800},
{0xDFFF},
{0xFFFE},
{0xFFFF},
{0x10FFFE},
{0x10FFFF},
// inclusive ranges {start, end}, sampled rather than fully enumerated when large
{0x3400, 0x4DB5},
{0x4E00, 0x9FA5},
{0xAC00, 0xD7A3},
{0xA000, 0xA48C},
{0xE000, 0xF8FF},
{0x20000, 0x2A6D6},
{0xE0000, 0xE00FF},
{0xF0000, 0xF00FD},
{0xFFF00, 0xFFFFD},
{0x100000, 0x1000FD},
{0x10FF00, 0x10FFFD},
};
/**
* Adds the collation elements from a file (or other stream) in the UCA format.
* Values will override any previous mappings.

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
* $Date: 2001/10/25 20:35:41 $
* $Revision: 1.2 $
* $Date: 2001/10/26 23:32:03 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -24,6 +24,8 @@ public class WriteCharts implements UCD_Types {
static UCD ucd;
static final byte UNSUPPORTED = 120;
static public void test(UCA uca) throws IOException {
uca.setAlternate(UCA.NON_IGNORABLE);
@ -33,6 +35,7 @@ public class WriteCharts implements UCD_Types {
Normalizer nfc = new Normalizer(Normalizer.NFC);
UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
cc.enableSamples();
Set set = new TreeSet();
@ -46,9 +49,9 @@ public class WriteCharts implements UCD_Types {
Iterator it = set.iterator();
int oldScript = -999;
byte oldScript = -127;
int[] scriptCount = new int[LIMIT_SCRIPT];
int[] scriptCount = new int[128];
int counter = 0;
@ -66,13 +69,16 @@ public class WriteCharts implements UCD_Types {
Utility.copyTextFile("help.html", true, "CollationCharts\\help.html");
indexFile = Utility.openPrintWriter("CollationCharts\\index_list.html");
Utility.appendFile("index_header.html", true, indexFile);
/*
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
indexFile.println("<title>UCA Default Collation Table</title>");
indexFile.println("<base target='main'>");
indexFile.println("<style><!-- p { font-size: 90% } --></style>");
indexFile.println("</head><body><h2 align='center'>UCA Default Collation Table</h2>");
indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");
*/
while (it.hasNext()) {
Utility.dot(counter);
@ -83,8 +89,6 @@ public class WriteCharts implements UCD_Types {
int cp = UTF16.charAt(s,0);
byte script = ucd.getScript(cp);
if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
else if (script == INHERITED_SCRIPT) script = COMMON_SCRIPT;
// get first non-zero primary
int primary = sortKey.charAt(0);
@ -92,23 +96,29 @@ public class WriteCharts implements UCD_Types {
else if (primary == 0) script = -2;
else if (primary < variable) script = -1;
else if (primary < high) script = COMMON_SCRIPT;
else if (primary >= UCA.UNSUPPORTED_BASE) script = UNSUPPORTED;
if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript;
if (script != oldScript
&& (oldScript < COMMON_SCRIPT || script != COMMON_SCRIPT && script != INHERITED_SCRIPT)) {
// && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT)
) {
closeFile(output);
output = null;
oldScript = script;
}
if (output == null) {
++scriptCount[script+3];
if (scriptCount[script+3] > 1) {
System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " +
ucd.getScriptID_fromIndex(script) + ", " + ucd.getCodeAndName(s));
getChunkName(script) + ", " + ucd.getCodeAndName(s));
}
output = openFile(scriptCount[script+3], script);
oldScript = script;
}
boolean firstPrimaryEquals = sortKey.charAt(0) == lastSortKey.charAt(0);
boolean firstPrimaryEquals = primary == lastSortKey.charAt(0);
int strength = uca.strengthDifference(sortKey, lastSortKey);
if (strength < 0) strength = -strength;
@ -125,41 +135,46 @@ public class WriteCharts implements UCD_Types {
String breaker = "";
if (columnCount > 10 || !firstPrimaryEquals) {
if (!firstPrimaryEquals) breaker = "</tr><tr>";
if (!firstPrimaryEquals || script == UNSUPPORTED) breaker = "</tr><tr>";
else breaker = "</tr><tr><td></td>"; // indent 1 cell
columnCount = 0;
}
String classname = primaryCount > 1 ? XCLASSNAME[strength] : CLASSNAME[strength];
output.println(breaker + classname + nfc.normalize(s)
+ "<br><tt>" + Utility.hex(s)
output.println(breaker + classname
+ " title='" + UCA.toString(sortKey) + "'>"
+ nfc.normalize(s)
+ "<br><tt>"
+ Utility.hex(s)
//+ "<br>" + script
//+ "<br>" + UCA.toString(sortKey)
+ "</tt></td>");
++columnCount;
}
closeFile(output);
indexFile.println("</body></html>");
indexFile.println("<hr><p>Last Modified: " + new Date());
indexFile.println("<br>UCA Version: " + uca.getDataVersion());
indexFile.println("<br>UCD Version: " + ucd.getVersion());
indexFile.println("</p></body></html>");
indexFile.close();
}
static final String[] CLASSNAME = {
"<td class='q'>",
"<td class='q'>",
"<td class='q'>",
"<td class='t'>",
"<td class='s'>",
"<td class='p'>"};
"<td class='q'",
"<td class='q'",
"<td class='q'",
"<td class='t'",
"<td class='s'",
"<td class='p'"};
static final String[] XCLASSNAME = {
"<td class='eq'>",
"<td class='eq'>",
"<td class='eq'>",
"<td class='et'>",
"<td class='es'>",
"<td class='ep'>"};
"<td class='eq'",
"<td class='eq'",
"<td class='eq'",
"<td class='et'",
"<td class='es'",
"<td class='ep'"};
static PrintWriter indexFile;
@ -188,6 +203,7 @@ public class WriteCharts implements UCD_Types {
else if (script == -2) return "IGNORABLE";
else if (script == -1) return "VARIABLE";
else if (script == HIRAGANA_SCRIPT) return "KATAKANA-HIRAGANA";
else if (script == UNSUPPORTED) return "UNSUPPORTED";
else return ucd.getScriptID_fromIndex(script);
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
* $Date: 2001/10/25 20:35:41 $
* $Revision: 1.5 $
* $Date: 2001/10/26 23:32:03 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -51,7 +51,7 @@ public class WriteCollationData implements UCD_Types {
public static void main(String args[]) throws Exception {
System.out.println("Building UCA");
collator = new UCA(null);
collator = new UCA(null, "");
System.out.println("Building UCD data");
ucd = UCD.make("");

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteHTMLCollation.java,v $
* $Date: 2001/08/31 00:20:39 $
* $Revision: 1.2 $
* $Date: 2001/10/26 23:32:03 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -48,7 +48,7 @@ public class WriteHTMLCollation implements UCD_Types {
checkFixes();
System.out.println("Building UCA");
collator = new UCA(null);
collator = new UCA(null, "");
System.out.println("Building UCD data (old)");
//UInfo.init();

View file

@ -42,7 +42,15 @@ characters. The characters are arranged in the following groups:</p>
<th align="left"><i>Letters</i></th>
<th class="x">According to script</th>
</tr>
<tr>
<th align="left"><i>Unsupported</i></th>
<th class="x">Not explicitly supported in this version of UCA; uses
code-point order</th>
</tr>
</table>
<p>Characters from large blocks, such as CJK-Ideographs, Hangul Syllables,
Private Use Area, etc. are represented by a sampling. Some unassigned code
points, non-characters and other edge cases are also added to the list.</p>
<p>The characters* within each group are arranged in cells. The color of the
cell indicates the strength of the difference between that character and the <i>previous</i>
character in the chart, as follows.</p>
@ -85,7 +93,10 @@ character in the chart, as follows.</p>
or no difference</th>
</tr>
</table>
&nbsp;
<p align="left">If tool-tips are enabled in your browser, pausing the
mouse over any cell shows a representation of the sort key. In this
representation, the separators between the weight levels are shown as
&quot;|&quot;.</p>
<table>
<tr>
<th>*</th>

View file

@ -6,7 +6,7 @@
<meta name="ProgId" content="FrontPage.Editor.Document">
</head>
<frameset rows="168,*">
<frameset rows="192,*">
<frame name="header" src="index_list.html" target="main" scrolling="auto">
<frame name="main" src="help.html" target="main" scrolling="auto">
<noframes>

View file

@ -0,0 +1,56 @@
<!doctype HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"><html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta http-equiv="Content-Language" content="en-us">
<meta name="VI60_defaultClientScript" content="JavaScript">
<meta name="GENERATOR" content="Microsoft FrontPage 4.0">
<meta name="keywords" content="Basic">
<meta name="ProgId" content="FrontPage.Editor.Document">
<title>UCA Default Collation Table</title>
<base target="main">
<style><!-- p { font-size: 90% }
--></style>
<link rel="stylesheet" type="text/css"
href="http://www.unicode.org/webscripts/standard_styles.css">
<script language="Javascript" type="text/javascript"
src="http://www.unicode.org/webscripts/commonHeader.js"></script>
</head>
<body>
<table cellspacing="0" cellpadding="0" width="100%" border="0">
<tbody>
<tr>
<td colspan="2">
<table cellspacing="0" cellpadding="0" width="100%" border="0">
<tbody>
<tr>
<td class="icon"><a href="http://www.unicode.org/"><img
alt="[Unicode]"
src="http://www.unicode.org/webscripts/logo60s2.gif"
align="middle" border="0" width="34" height="33"></a>&nbsp;&nbsp;<a
class="bar" href="http://www.unicode.org/charts"
target="_parent"><font size="3">Charts</font></a></td>
<td class="bar"><a class="bar" href="http://www.unicode.org"
target="_parent">Home</a> | <a class="bar"
href="http://www.unicode.org/sitemap/" target="_parent">Site Map</a>
| <a class="bar" href="http://www.unicode.org/search"
target="_parent">Search </a><script
language="Javascript"
src="http://www.unicode.org/webscripts/commonSearch.js"
type="text/javascript"></script>
<NOSCRIPT>
<a href="http://www.unicode.org/webscripts/quick_links.html"
class="bar" target="_blank">Goto</a></NOSCRIPT>
</td>
</tr>
</tbody>
</table>
</td>
</tr>
</tbody>
</table>
<!-- BEGIN CONTENTS -->
<h2 align="center">UCA Default Collation Table</h2>
<p align="center"><a href="help.html">Help</a>

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.5 $
* $Date: 2001/10/26 23:33:08 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -116,6 +116,7 @@ public class DerivedProperty implements UCD_Types {
int prop;
NF_UnsafeStartProp(int i) {
testStatus = true;
prop = i-NFD_UnsafeStart;
nfx = nf[prop];
name = NAME[prop] + "_UnsafeStart";
@ -143,6 +144,7 @@ public class DerivedProperty implements UCD_Types {
boolean keepNonZero = true;
NFC_Prop(int i) {
testStatus = true;
BitSet[] bitsets = new BitSet[3];
switch(i) {
case NFC_Leading: bitsets[0] = bitset = new BitSet(); break;
@ -238,11 +240,11 @@ public class DerivedProperty implements UCD_Types {
class CaseDProp extends DProp {
byte val;
CaseDProp (int i) {
testStatus = true;
val = (i == Missing_Uppercase ? Lu : i == Missing_Lowercase ? Ll : Lt);
name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
header = "# Derived Property: " + name
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
testStatus = true;
}
boolean hasProperty(int cp) {
byte cat = ucdData.getCategory(cp);
@ -451,10 +453,10 @@ of characters, the first of which has a non-zero combining class.
dprops[FullCompInclusion] = new DProp() {
{
testStatus = true;
name = "Full_Composition_Inclusion";
shortName = "Comp_In";
defaultStyle = SHORT;
testStatus = true;
header = "# Derived Property: " + name
+ ": Full Composition Inclusion"
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.7 $
* $Date: 2001/10/26 23:33:07 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -23,6 +23,8 @@ import com.ibm.text.UTF16;
public class GenerateData implements UCD_Types {
static final String HORIZONTAL_LINE = "# ================================================";
static UnifiedBinaryProperty ubp;
@ -44,7 +46,7 @@ public class GenerateData implements UCD_Types {
if (arg.equalsIgnoreCase("partition")) {
partitionProperties();
} else if (arg.equalsIgnoreCase("list")) {
} else if (arg.equalsIgnoreCase("PropertyAliases")) {
listProperties();
} else if (arg.equalsIgnoreCase("listAccents")) {
listCombiningAccents();
@ -225,7 +227,7 @@ public class GenerateData implements UCD_Types {
output.println("# Date: " + myDateFormat.format(new Date()) + " [MD]");
output.println("# Note: Unassigned and Noncharacter codepoints are omitted,");
output.println("# except when listing Noncharacter or Cn.");
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println();
}
@ -235,7 +237,7 @@ public class GenerateData implements UCD_Types {
for (int i = 0; i < DerivedProperty.LIMIT; ++i) {
if ((bitMask & (1L<<i)) == 0) continue;
System.out.print('.');
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println();
new DerivedPropertyLister(ucd, i, output).print();
output.flush();
@ -417,27 +419,28 @@ public class GenerateData implements UCD_Types {
Map duplicates = new TreeMap();
Set sorted = new TreeSet(java.text.Collator.getInstance());
Map accumulation = new TreeMap();
String spacing;
for(int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) {
propAbb = UCD_Names.NON_ENUMERATED[k][0];
prop = UCD_Names.NON_ENUMERATED[k][1];
propAbb = fixGaps(UCD_Names.NON_ENUMERATED[k][0], false);
prop = fixGaps(UCD_Names.NON_ENUMERATED[k][1], true);
spacing = Utility.repeat(" ", 10-propAbb.length());
sorted.add("AA; " + propAbb + spacing + "; " + prop);
checkDuplicate(duplicates, propAbb, prop);
if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop);
checkDuplicate(duplicates, accumulation, propAbb, prop);
if (!prop.equals(propAbb)) checkDuplicate(duplicates, accumulation, prop, prop);
}
sorted.add("xx; T ; True");
checkDuplicate(duplicates, "T", "xx");
checkDuplicate(duplicates, accumulation, "T", "xx=True");
sorted.add("xx; F ; False");
checkDuplicate(duplicates, "F", "xx");
checkDuplicate(duplicates, accumulation, "F", "xx=False");
sorted.add("qc; Y ; Yes");
checkDuplicate(duplicates, "Y", "qc");
checkDuplicate(duplicates, accumulation, "Y", "qc=Yes");
sorted.add("qc; N ; No");
checkDuplicate(duplicates, "Y", "qc");
checkDuplicate(duplicates, accumulation, "N", "qc=No");
sorted.add("qc; M ; Maybe");
checkDuplicate(duplicates, "Y", "qc");
checkDuplicate(duplicates, accumulation, "M", "qc=Maybe");
for (int i = 0; i < LIMIT_ENUM; ++i) {
@ -446,12 +449,12 @@ public class GenerateData implements UCD_Types {
if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
if (type == i && type != BINARY_PROPERTIES && type != DERIVED) {
propAbb = ubp.getPropertyName(i, SHORT);
prop = ubp.getPropertyName(i, LONG);
propAbb = fixGaps(ubp.getPropertyName(i, SHORT), false);
prop = fixGaps(ubp.getPropertyName(i, LONG), true);
spacing = Utility.repeat(" ", 10-propAbb.length());
sorted.add("BB; " + propAbb + spacing + "; " + prop);
checkDuplicate(duplicates, propAbb, prop);
if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop);
checkDuplicate(duplicates, accumulation, propAbb, prop);
if (!prop.equals(propAbb)) checkDuplicate(duplicates, accumulation, prop, prop);
}
if (!ubp.isDefined(i)) continue;
@ -460,7 +463,7 @@ public class GenerateData implements UCD_Types {
String value = ubp.getID(i, LONG);
if (value.length() == 0) value = "none";
else if (value.equals("<unused>")) continue;
value = fixGaps(value);
value = fixGaps(value, true);
if (type == SCRIPT) {
value = ucd.getCase(value, FULL, TITLE);
@ -468,6 +471,7 @@ public class GenerateData implements UCD_Types {
String abbvalue = ubp.getID(i, SHORT);
if (abbvalue.length() == 0) abbvalue = "no";
abbvalue = fixGaps(abbvalue, false);
if (type == COMBINING_CLASS) {
if (value.startsWith("Fixed_")) { continue; }
@ -497,19 +501,32 @@ public class GenerateData implements UCD_Types {
if (type == BINARY_PROPERTIES || type == DERIVED) {
sorted.add("ZZ; " + abbvalue + spacing + "; " + value);
checkDuplicate(duplicates, value, value);
if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, value);
checkDuplicate(duplicates, accumulation, value, value);
if (!value.equalsIgnoreCase(abbvalue)) checkDuplicate(duplicates, accumulation, abbvalue, value);
continue;
}
sorted.add(propAbb + "; " + abbvalue + spacing + "; " + value);
checkDuplicate(duplicates, value, prop + "=" + value);
if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, prop + "=" + value);
checkDuplicate(duplicates, accumulation, value, prop + "=" + value);
if (!value.equalsIgnoreCase(abbvalue)) checkDuplicate(duplicates, accumulation, abbvalue, prop + "=" + value);
}
PrintWriter log = Utility.openPrintWriter("PropertyAliases.txt");
Utility.appendFile("PropertyAliasHeader.txt", log);
PrintWriter log = Utility.openPrintWriter("PropertyAliases-" + ucd.getVersion() + "dX.txt");
Utility.appendFile("PropertyAliasHeader.txt", false, log);
log.println("# Generated: " + new Date() + ", MD");
log.println(HORIZONTAL_LINE);
log.println();
Utility.print(log, sorted, "\r\n", new MyBreaker());
log.println();
log.println(HORIZONTAL_LINE);
log.println();
log.println("# Non-Unique names: the same name (under either an exact or loose match)");
log.println("# occurs as a property name or property value name");
log.println("# Note: no two property names can be the same,");
log.println("# nor can two property value names for the same property be the same.");
log.println();
Utility.print(log, accumulation.values(), "\r\n", new MyBreaker());
log.println();
log.close();
}
@ -525,20 +542,58 @@ public class GenerateData implements UCD_Types {
}
}
static void checkDuplicate(Map m, String toCheck, String comment) {
String result = (String) m.get(toCheck);
static void checkDuplicate(Map m, Map accumulation, String toCheck, String originalComment) {
toCheck = skeleton(toCheck);
String comment = "{" + originalComment + "}";
Set result = (Set) m.get(toCheck);
if (result != null) {
System.out.println("Collision with " + toCheck);
System.out.println(" Between " + comment);
System.out.println(" And " + result);
// Warn on serious problem: two property-names collide
// or two property names & values collide.
// examples:
// if (1) "c" stood for both "General_Category" and "Combining_Class"
// or if (2) "X=cc" stood for "X=control" and "X=compatibility"
// 1: comment doesn't contain "=", and something in the results doesn't contain "="
// 2: comment does contain "X=", and something else in results contains "X="
int equalPos = comment.indexOf('=');
if (equalPos < 0) { // #1
String conflict = Utility.findSubstring("=", result, false);
if (conflict != null) {
System.out.println("Property Name Conflict " + toCheck);
System.out.println(" With " + comment);
System.out.println(" And " + conflict);
}
} else { // #2
String trial = comment.substring(0,equalPos+1);
String conflict = Utility.findSubstring(trial, result, true);
if (conflict != null) {
System.out.println("Property Value Name Conflict " + toCheck);
System.out.println(" With " + comment);
System.out.println(" And " + conflict);
}
}
// accumulate differences
String acc = (String)accumulation.get(toCheck);
/*if (acc == null) {
acc = "# \"" + toCheck + "\":\t" + originalComment;
}
acc += ";\t" + result;
*/
result.add(comment);
accumulation.put(toCheck, "# \"" + toCheck + "\":\t" + result);
} else {
m.put(skeleton(toCheck), comment);
result = new TreeSet();
result.add(comment);
m.put(toCheck, result);
}
}
static String fixGaps(String source) {
static String fixGaps(String source, boolean titlecaseStart) {
StringBuffer result = new StringBuffer();
byte lastCat = -1;
boolean haveFirstCased = true;
for (int i = 0; i < source.length(); ++i) {
char c = source.charAt(i);
if (c == ' ' || c == '-') c = '_';
@ -546,6 +601,12 @@ public class GenerateData implements UCD_Types {
if (lastCat == Ll && cat == Lu) {
result.append('_');
}
if (haveFirstCased && (cat == Ll || cat == Lt || cat == Lu)) {
if (titlecaseStart) {
c = ucd.getCase(c, SIMPLE, TITLE).charAt(0);
}
haveFirstCased = false;
}
result.append(c);
lastCat = cat;
}
@ -557,7 +618,7 @@ public class GenerateData implements UCD_Types {
source = source.toLowerCase();
for (int i = 0; i < source.length(); ++i) {
char c = source.charAt(i);
if (c < 'a' || c > 'z') continue;
if (c == ' ' || c == '_' || c == '-') continue;
result.append(c);
}
return result.toString();
@ -596,15 +657,15 @@ public class GenerateData implements UCD_Types {
&& i < (AGE + NEXT_ENUM)) continue;
if ((last & 0xFF00) != (i & 0xFF00) && (i <= BINARY_PROPERTIES || i >= SCRIPT)) {
output.println();
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println("# " + UCD_Names.UNIFIED_PROPERTIES[i>>8]);
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println();
System.out.println();
System.out.println(UCD_Names.UNIFIED_PROPERTIES[i>>8]);
last = i;
} else {
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println();
}
System.out.print(".");
@ -612,9 +673,9 @@ public class GenerateData implements UCD_Types {
}
if (endEnum == LIMIT_ENUM) {
output.println();
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println("# Numeric Values (from UnicodeData.txt, field 6/7/8)");
output.println("# ================================================");
output.println(HORIZONTAL_LINE);
output.println();
System.out.println();
System.out.println("@NUMERIC VALUES");
@ -829,8 +890,7 @@ public class GenerateData implements UCD_Types {
PrintWriter log = Utility.openPrintWriter(filename + "dX.txt");
try {
log.println("# Derived file showing when various code points were designated in Unicode");
log.println("# author: M. Davis");
log.println("# generated: " + new Date());
log.println("# generated: " + new Date() + ", MD");
log.println("# Notes:");
log.println("# - The term 'designated' means that a previously reserved code point was specified");
log.println("# to be a noncharacter or surrogate, or assigned as a character,");
@ -842,22 +902,22 @@ public class GenerateData implements UCD_Types {
log.println("# were designated in version 2.0, but not specifically listed in the UCD");
log.println("# until versions 3.0 and 3.1 respectively.");
log.println("# ================================================");
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister(null, "1.1.0", log).print();
log.println("# ================================================");
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("1.1.0", "2.0.0", log).print();
log.println("# ================================================");
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("2.0.0", "2.1.2", log).print();
log.println("# ================================================");
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("2.1.2", "3.0.0", log).print();
log.println("# ================================================");
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.0.0", "3.1.0", log).print();
log.println("# ================================================");
log.println(HORIZONTAL_LINE);
log.println();
new DiffPropertyLister("3.1.0", "3.2.0", log).print();
/*

View file

@ -1,3 +1,4 @@
# DRAFT
# PropertyAliases-3.2.0.txt
#
# This file contains aliases for properties and property values used in the UCD.
@ -42,6 +43,3 @@
#
# The combination of property value and property name is, however, unique.
# For more information, see UTR #24: Regular Expression Guidelines
# ================================================

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
* $Date: 2001/10/26 23:33:07 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -315,7 +315,7 @@ public final class UCD implements UCD_Types {
if (caseType == TITLE) { // set the case type for the next character
// certain characters are ignored
if (cp == '-' || cp == SHY || cp == '\'' || cp == APOSTROPHE) continue;
if (cp == SHY || cp == '\'' || cp == APOSTROPHE) continue;
byte cat = getCategory(cp);
if (cat == Mn || cat == Me || cat == Cf || cat == Lm) continue;
if (dp.hasProperty(cp, DerivedProperty.DefaultIgnorable)) continue;
@ -572,7 +572,7 @@ public final class UCD implements UCD_Types {
case 230: s = style < LONG ? "A" : "Above"; break;
case 232: s = style < LONG ? "AR" : "AboveRight"; break;
case 233: s = style < LONG ? "DB" : "DoubleBelow"; break;
case 234: s = style < LONG ? "DB" : "DoubleAbove"; break;
case 234: s = style < LONG ? "DA" : "DoubleAbove"; break;
case 240: s = style < LONG ? "IS" : "IotaSubscript"; break;
default: s += "_" + index;
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
* $Date: 2001/10/26 23:33:07 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -54,7 +54,7 @@ final class UCD_Names implements UCD_Types {
static final String[] SHORT_UNIFIED_PROPERTIES = {
"GeneralCategory",
"CombiningClass",
"CanonicalCombiningClass",
"BidiClass",
"DecompositionType",
"NumericType",
@ -70,7 +70,7 @@ final class UCD_Names implements UCD_Types {
static final String[] ABB_UNIFIED_PROPERTIES = {
"gc",
"cc",
"ccc",
"bc",
"dt",
"nt",
@ -467,23 +467,23 @@ final class UCD_Names implements UCD_Types {
static String[] SHORT_DT = {
"", // NONE
"ca", // CANONICAL
"co", // Otherwise unspecified compatibility character.
"fo", // A font variant (e.g. a blackletter form).
"can", // CANONICAL
"com", // Otherwise unspecified compatibility character.
"font", // A font variant (e.g. a blackletter form).
"nb", // A no-break version of a space or hyphen.
"in", // // An initial presentation form (Arabic).
"me", // // A medial presentation form (Arabic).
"fi", // // A final presentation form (Arabic).
"is", // An isolated presentation form (Arabic).
"ci", // An encircled form.
"sp", // A superscript form.
"sb", // A subscript form.
"ve", // A vertical layout presentation form.
"wi", // A wide (or zenkaku) compatibility character.
"na", // A narrow (or hankaku) compatibility character.
"sm", // A small variant form (CNS compatibility).
"sq", // A CJK squared font variant.
"fr", // A vulgar fraction form.
"init", // // An initial presentation form (Arabic).
"med", // // A medial presentation form (Arabic).
"fin", // // A final presentation form (Arabic).
"iso", // An isolated presentation form (Arabic).
"enc", // An encircled form.
"sup", // A superscript form.
"sub", // A subscript form.
"vert", // A vertical layout presentation form.
"wide", // A wide (or zenkaku) compatibility character.
"nar", // A narrow (or hankaku) compatibility character.
"sml", // A small variant form (CNS compatibility).
"sqr", // A CJK squared font variant.
"fra", // A vulgar fraction form.
};
static private String[] MIRRORED_TABLE = {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2001/10/25 20:32:38 $
* $Revision: 1.5 $
* $Date: 2001/10/26 23:33:48 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -84,6 +84,15 @@ public final class Utility { // COMMON UTILITIES
return -1;
}
/**
* Scans target in iteration order and returns the string form of the first
* element whose "contains source" status equals the invert flag.
* Note the sense of the flag: with invert == true the first element that
* CONTAINS source is returned; with invert == false the first element that
* does NOT contain source is returned.
* @param source substring to look for
* @param target elements to scan; each is converted with toString()
* @param invert required value of "element contains source"
* @return the first matching element as a String, or null if none matches
*/
public static String findSubstring(String source, Set target, boolean invert) {
    for (Iterator elements = target.iterator(); elements.hasNext();) {
        String candidate = elements.next().toString();
        boolean containsSource = candidate.indexOf(source) != -1;
        if (containsSource == invert) {
            return candidate;
        }
    }
    return null;
}
public static byte lookup(String source, String[] target) {
int result = Utility.find(source, target);
if (result != -1) return (byte)result;