minor additions to change PropertyAlias.txt

X-SVN-Rev: 6812
This commit is contained in:
Mark Davis 2001-11-13 02:31:55 +00:00
parent 9565246f34
commit 3405bab3d1
6 changed files with 190 additions and 65 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2001/10/31 00:02:27 $
* $Revision: 1.9 $
* $Date: 2001/11/13 02:31:55 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -424,6 +424,7 @@ public class GenerateData implements UCD_Types {
Set accumulation = new TreeSet(java.text.Collator.getInstance());
String spacing;
/*
BufferedReader blocks = Utility.openUnicodeFile("Blocks", ucd.getVersion());
String[] parts = new String[10];
while (true) {
@ -442,6 +443,7 @@ public class GenerateData implements UCD_Types {
checkDuplicate(duplicates, accumulation, value, "Block=" + value);
}
blocks.close();
*/
for (int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) {
propAbb = fixGaps(UCD_Names.NON_ENUMERATED[k][0], false);
@ -456,15 +458,19 @@ public class GenerateData implements UCD_Types {
valueAbb = fixGaps(UCD_Names.SUPER_CATEGORIES[k][0], false);
value = fixGaps(UCD_Names.SUPER_CATEGORIES[k][1], true);
spacing = Utility.repeat(" ", 10-valueAbb.length());
sorted.add("gc; " + valueAbb + spacing + "; " + value);
String baseLine = "gc; " + valueAbb + spacing + "; " + value;
spacing = Utility.repeat(" ", 50-baseLine.length());
sorted.add(baseLine + spacing + "# " + UCD_Names.SUPER_CATEGORIES[k][2]);
checkDuplicate(duplicates, accumulation, value, "General_Category=" + value);
if (!value.equals(valueAbb)) checkDuplicate(duplicates, accumulation, valueAbb, "General_Category=" + value);
}
/*
sorted.add("xx; T ; True");
checkDuplicate(duplicates, accumulation, "T", "xx=True");
sorted.add("xx; F ; False");
checkDuplicate(duplicates, accumulation, "F", "xx=False");
*/
sorted.add("qc; Y ; Yes");
checkDuplicate(duplicates, accumulation, "Y", "qc=Yes");
sorted.add("qc; N ; No");
@ -507,6 +513,10 @@ public class GenerateData implements UCD_Types {
if (value.startsWith("Fixed_")) { continue; }
}
if (type == JOINING_GROUP) {
valueAbb = "n/a";
}
/*
String elide = "";
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
@ -546,7 +556,18 @@ public class GenerateData implements UCD_Types {
log.println("# Generated: " + new Date() + ", MD");
log.println(HORIZONTAL_LINE);
log.println();
Utility.print(log, sorted, "\r\n", new MyBreaker());
Utility.print(log, sorted, "\r\n", new MyBreaker(true));
log.close();
log = Utility.openPrintWriter("PropertyValueAliases-" + ucd.getVersion() + "dX.txt");
Utility.appendFile("PropertyValueAliasHeader.txt", false, log);
log.println("# Generated: " + new Date() + ", MD");
log.println(HORIZONTAL_LINE);
log.println();
Utility.print(log, sorted, "\r\n", new MyBreaker(false));
log.close();
log = Utility.openPrintWriter("PropertyAliasSummary-" + ucd.getVersion() + "dX.txt");
log.println();
log.println(HORIZONTAL_LINE);
log.println();
@ -555,20 +576,43 @@ public class GenerateData implements UCD_Types {
log.println("# Note: no two property names can be the same,");
log.println("# nor can two property value names for the same property be the same.");
log.println();
Utility.print(log, accumulation, "\r\n", new MyBreaker());
Utility.print(log, accumulation, "\r\n", new MyBreaker(false));
log.println();
log.close();
}
static class MyBreaker implements Utility.Breaker {
boolean status;
public MyBreaker(boolean status) {
this.status = status;
}
public boolean filter(Object current) {
String c = current.toString();
if (c.startsWith("AA") || c.startsWith("BB") || c.startsWith("ZZ")) return status;
return !status;
}
public String get(Object current, Object old) {
if (old == null) return "";
if (old == null) {
old = " ";
}
String c = current.toString();
String o = old.toString();
if (c.length() >= 2 && o.length() >= 0 && !c.substring(0,2).equals(o.substring(0,2))) {
return "\r\n";
String sep = "";
if (!c.substring(0,2).equals(o.substring(0,2))) {
sep = "\r\n";
if (status) {
if (c.startsWith("AA")) sep = sep + HORIZONTAL_LINE + sep + "# Non-enumerated Properties" + sep + HORIZONTAL_LINE + sep;
if (c.startsWith("BB")) sep = sep + HORIZONTAL_LINE + sep + "# Enumerated Non-Binary Properties" + sep + HORIZONTAL_LINE + sep;
if (c.startsWith("ZZ")) sep = sep + HORIZONTAL_LINE + sep + "# Binary Properties" + sep + HORIZONTAL_LINE + sep;
}
}
return "";
if (status) {
c = c.substring(4);
}
return sep + c;
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
* $Date: 2001/11/13 02:31:55 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -41,7 +41,8 @@ public final class Main {
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
else if (arg.equalsIgnoreCase("checkCase2")) VerifyUCD.checkCase2();
else if (arg.equalsIgnoreCase("checkCaseLong")) VerifyUCD.checkCase2(true);
else if (arg.equalsIgnoreCase("checkCaseShort")) VerifyUCD.checkCase2(false);
else if (arg.equalsIgnoreCase("checkCanonicalProperties")) VerifyUCD.checkCanonicalProperties();
else if (arg.equalsIgnoreCase("CheckCaseFold")) VerifyUCD.CheckCaseFold();
else if (arg.equalsIgnoreCase("idn")) VerifyUCD.VerifyIDN();

View file

@ -1,46 +1,30 @@
# DRAFT
# PropertyAliases-3.2.0.txt
#
# This file contains aliases for properties and property values used in the UCD.
# This file contains aliases for properties used in the UCD.
# These names can be used for XML formats of UCD data, for regular-expression
# property tests, and other programmatic textual descriptions of Unicode data.
# The names are not normative, except where they correspond to normative values
# in the UCD.
# The names are not normative, except where they correspond to normative
# properties in the UCD. For information on which properties are normative,
# see UnicodeCharacterDatabase.html.
#
# The names may be translated in appropriate environments, and additional
# aliases may be useful.
#
# FORMAT
#
# Each line has three fields, separated by semicolons.
# Each line has two fields, separated by a semicolon.
#
# First Field: Where the first field is AA, BB, or ZZ, then the line describes a property name:
# First Field: The first field is an abbreviated name for the property.
#
# AA - non-enumerated properties
# BB - enumerated, non-binary properties
# ZZ - binary properties and quick-check properties
#
# (The values AA, BB, and ZZ are arbitrary -- they were simply chosen to distinguish
# the different types.)
#
# Where the first field is not one of the above, the line describes a
# property value name. The first field describes the property for which that
# property value name is used. There are two special properties:
#
# xx stands for any binary property
# qc stands for any quick-check property
#
# Second Field: The second field is an abbreviated name.
# If there is no abbreviated name available, the field is marked with "n/a".
#
# Third Field: The third field is a long name.
# Second Field: The second field is a long name.
#
# With loose matching of property names, the case distinctions, whitespace,
# and '_' are ignored.
#
# NOTE: Currently there is at most one abbreviated name and one long name for
# each property and property value. However, in the future additional aliases
# may be added. In such a case, the first line for the property or property value
# each property. However, in the future additional aliases
# may be added. In such a case, the first line for the property
# would have the preferred alias for output.
#
# NOTE: The property value names are NOT unique across properties, especially
@ -53,7 +37,5 @@
# cc means Combining_Class property, and
# cc means the General_Category property value Control (cc)
#
# Comments at the end of the file show cases of non-unique names.
#
# The combination of property value and property name is, however, unique.
# For more information, see UTR #24: Regular Expression Guidelines

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2001/10/31 00:02:27 $
* $Revision: 1.6 $
* $Date: 2001/11/13 02:31:55 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -32,7 +32,10 @@ final class UCD_Names implements UCD_Types {
{"stc", "Simple_Titlecase_Mapping"},
{"sfc", "Simple_Case_Folding"},
{"scc", "Special_Case_Condition"},
{"blk", "Block"}
{"blk", "Block"},
{"na1", "Unicode_1_Name"},
{"isc", "ISO_Comment"},
{"age", "Age"},
};
static final String[] UNIFIED_PROPERTIES = {
@ -406,13 +409,14 @@ final class UCD_Names implements UCD_Types {
};
// Table of General_Category groupings. Each row is
// {abbreviation, long name[, member values]}; where present, the third
// element lists the General_Category values covered by the row,
// separated by " | ".
// NOTE(review): two-element and three-element rows for the same
// abbreviation appear side by side below (earlier and later forms of the
// same entries) -- confirm which rows are current before indexing [2].
static final String[][] SUPER_CATEGORIES = {
{"L", "Letter"},
{"M", "Mark"},
{"N", "Number"},
{"Z", "Separator"},
{"C", "Other"},
{"S", "Symbol"},
{"P", "Punctuation"},
{"L", "Letter", "Ll | Lm | Lo | Lt | Lu"},
{"M", "Mark", "Mc | Me | Mn"},
{"N", "Number", "Nd | Nl | No"},
{"Z", "Separator", "Zl | Zp | Zs"},
{"C", "Other", "Cc | Cf | Cn | Co | Cs"},
{"S", "Symbol", "Sc | Sk | Sm | So"},
{"P", "Punctuation", "Pc | Pd | Pe | Pf | Pi | Po | Ps"},
// Long name contains a space, unlike the single-word names above.
{"Lc", "Cased Letter", "Ll | Lt | Lu"},
};

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.6 $
* $Date: 2001/11/13 02:31:55 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -141,43 +141,109 @@ public class VerifyUCD implements UCD_Types {
log.close();
}
public static void checkCase2() throws IOException {
public static void checkCase2(boolean longForm) throws IOException {
Utility.fixDot();
System.out.println("checkCase");
ucd = UCD.make(Main.ucdVersion);
initNormalizers();
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
/*String tx1 = "\u0391\u0342\u0345";
String ux1 = "\u0391\u0342\u0399";
String ctx1 = nfc.normalize(tx1);
String ctx2 = nfc.normalize(ux1); // wrong??
//System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
*/
String fileName = "CaseNormalizationDifferences.txt";
PrintWriter log = Utility.openPrintWriter(fileName);
log.println("Differences between case(normalize(cp)) and normalize(case(cp))");
log.println("u, l, t - upper, lower, title");
log.println("c, d - nfc, nfd");
//Utility.DOTMASK = 0x7F;
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
Utility.dot(cp);
if (!ucd.isRepresented(cp) || ucd.isPUA(cp)) continue;
if (cp == '\u3371') {
if (cp == '\u0130') {
System.out.println("debug");
}
String x = UTF32.valueOf32(cp);
String dx = nfd.normalize(cp);
String cx = nfc.normalize(cp);
String ux = ucd.getCase(x, FULL, UPPER);
String lx = ucd.getCase(x, FULL, LOWER);
String tx = ucd.getCase(x, FULL, TITLE);
String dux = nfd.normalize(ux);
String dlx = nfd.normalize(lx);
String dtx = nfd.normalize(tx);
if (x.equals(dx) && dx.equals(cx) && cx.equals(ux) && ux.equals(lx) && lx.equals(tx)) continue;
String cux = nfc.normalize(ux);
String clx = nfc.normalize(lx);
String ctx = nfc.normalize(tx);
if (x.equals(cx)) {
boolean needBreak = false;
if (!clx.equals(lx)) needBreak = true;
if (!ctx.equals(tx)) needBreak = true;
if (!cux.equals(ux)) needBreak = true;
if (needBreak) {
log.println("# Was not NFC:");
log.println(
"## " + Utility.hex(x) + "; "
+ Utility.hex(lx) + "; "
+ Utility.hex(tx) + "; "
+ Utility.hex(ux) + "; # "
+ ucd.getName(x));
log.println("# should be:");
log.println(
Utility.hex(x) + "; "
+ Utility.hex(clx) + "; "
+ Utility.hex(ctx) + "; "
+ Utility.hex(cux) + "; # "
+ ucd.getName(x));
log.println();
}
}
String dux = nfd.normalize(ux);
String dlx = nfd.normalize(lx);
String dtx = nfd.normalize(tx);
String startdx = getMarks(dx, false);
String enddx = getMarks(dx, true);
String dx = nfd.normalize(cp);
String cx = nfc.normalize(cp);
String startdux = getMarks(dux, false);
String enddux = getMarks(dux, true);
String startdtx = getMarks(dtx, false);
String enddtx = getMarks(dtx, true);
String startdlx = getMarks(dlx, false);
String enddlx = getMarks(dlx, true);
// If the new marks don't occur in the old decomposition, we got a problem!
if (!startdx.startsWith(startdux) || !startdx.startsWith(startdtx) || !startdx.startsWith(startdlx)
|| !enddx.endsWith(enddux) || !enddx.endsWith(enddtx) || !enddx.endsWith(enddlx)) {
log.println("Combining Class Difference for " + ucd.getCodeAndName(x));
log.println("x: " + ucd.getCodeAndName(dx) + ", " + Utility.hex(startdx) + ", " + Utility.hex(enddx));
log.println("ux: " + ucd.getCodeAndName(dux) + ", " + Utility.hex(startdux) + ", " + Utility.hex(enddux));
log.println("tx: " + ucd.getCodeAndName(dtx) + ", " + Utility.hex(startdtx) + ", " + Utility.hex(enddtx));
log.println("lx: " + ucd.getCodeAndName(dlx) + ", " + Utility.hex(startdlx) + ", " + Utility.hex(enddlx));
log.println();
}
if (!longForm) continue;
String udx = ucd.getCase(dx, FULL, UPPER);
String ldx = ucd.getCase(dx, FULL, LOWER);
String tdx = ucd.getCase(dx, FULL, TITLE);
@ -286,6 +352,28 @@ public class VerifyUCD implements UCD_Types {
log.close();
}
/**
 * Extracts the run of combining marks found at one edge of a string.
 * <p>
 * A code point is treated as a mark when {@code ucd.getCombiningClass}
 * reports a non-zero value for it; scanning stops at the first code point
 * whose class is zero.
 *
 * @param s     the text to examine
 * @param doEnd {@code false} to scan forward from the start of {@code s},
 *              {@code true} to scan backward from its end
 * @return the leading (or trailing) run of marks, which is empty when the
 *         edge code point has class zero; {@code s} itself when no code
 *         point with class zero is found (including when {@code s} is empty)
 */
public static String getMarks(String s, boolean doEnd) {
    final int length = s.length();

    if (doEnd) {
        int limit = length;
        while (limit > 0) {
            // NOTE(review): relies on UTF16.charAt stepping back to the lead
            // surrogate when limit-1 lands on a trail surrogate -- confirm.
            final int ch = UTF16.charAt(s, limit - 1);
            if (ucd.getCombiningClass(ch) == 0) {
                return s.substring(limit);
            }
            limit -= UTF16.getCharCount(ch);
        }
        return s;
    }

    int start = 0;
    while (start < length) {
        final int ch = UTF16.charAt(s, start);
        if (ucd.getCombiningClass(ch) == 0) {
            return s.substring(0, start);
        }
        start += UTF16.getCharCount(ch);
    }
    return s;
}
static final String names[] = {"LOWER", "TITLE", "UPPER", "(UNC)", "MIXED"};
static final String lowerNames[] = {"", "Other_Lower"};

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2001/10/31 00:02:54 $
* $Revision: 1.7 $
* $Date: 2001/11/13 02:31:34 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -30,9 +30,11 @@ public final class Utility { // COMMON UTILITIES
}
private static boolean needCRLF = false;
public static int DOTMASK = 0x7FF;
public static void dot(int i) {
if ((i % 0x7FF) == 0) {
if ((i % DOTMASK) == 0) {
needCRLF = true;
System.out.print('.');
}
@ -458,6 +460,7 @@ public final class Utility { // COMMON UTILITIES
/**
 * Pair of callbacks consulted by {@code print} for every element of the
 * collection being written.
 */
public interface Breaker {
    /**
     * Decides whether an element takes part in the output.
     *
     * @param current the candidate element
     * @return {@code true} to keep the element, {@code false} to skip it
     */
    boolean filter(Object current);

    /**
     * Supplies the text written for an element.
     *
     * @param current the element being written
     * @param old     the element written before it, or {@code null} if
     *                {@code current} is the first
     * @return the text to print for {@code current}
     */
    String get(Object current, Object old);
}
public static void print(PrintWriter pw, Collection c, String separator, Breaker b) {
@ -466,14 +469,17 @@ public final class Utility { // COMMON UTILITIES
Object last = null;
while (it.hasNext()) {
Object obj = it.next();
if (b != null && !b.filter(obj)) continue;
if (first) {
first = false;
} else {
pw.print(separator);
}
else pw.print(separator);
if (b != null) {
pw.print(b.get(obj, last));
} else {
pw.print(obj);
}
pw.print(obj);
last = obj;
}
}