mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-21 12:40:02 +00:00
Revisions for specialcasing & misc fixes
X-SVN-Rev: 7349
This commit is contained in:
parent
ac085286cd
commit
a903b84867
26 changed files with 1742 additions and 814 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/12/13 23:35:54 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -26,12 +26,9 @@ public class BuildNames implements UCD_Types {
|
|||
|
||||
static final boolean DEBUG = true;
|
||||
|
||||
static UCD ucd;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
|
||||
ucd = UCD.make();
|
||||
|
||||
Main.setUCD();
|
||||
collectWords();
|
||||
}
|
||||
|
||||
|
@ -85,8 +82,8 @@ public class BuildNames implements UCD_Types {
|
|||
int used = 0;
|
||||
int sum = 0;
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
if (ucd.hasComputableName(i)) continue;
|
||||
String name = transform(ucd.getName(i));
|
||||
if (Main.ucd.hasComputableName(i)) continue;
|
||||
String name = transform(Main.ucd.getName(i));
|
||||
|
||||
|
||||
sum += name.length();
|
||||
|
|
41
tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
Normal file
41
tools/unicodetools/com/ibm/text/UCD/CaseFoldingHeader.txt
Normal file
|
@ -0,0 +1,41 @@
|
|||
# Case Folding Properties
|
||||
#
|
||||
# This file is a supplement to the UnicodeData file.
|
||||
# It provides a case folding mapping generated from the Unicode Character Database.
|
||||
# If all characters are mapped according to the full mapping below, then
|
||||
# case differences (according to UnicodeData.txt and SpecialCasing.txt)
|
||||
# are eliminated.
|
||||
#
|
||||
# The data supports both implementations that require simple case foldings
|
||||
# (where string lengths don't change), and implementations that allow full case folding
|
||||
# (where string lengths may grow). Note that where they can be supported, the
|
||||
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
|
||||
#
|
||||
# NOTE: case folding does not preserve normalization forms (NFC, NFD, NFKC, NFKD)!
|
||||
#
|
||||
# For information on case folding, see
|
||||
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
|
||||
#
|
||||
# ================================================================================
|
||||
# Format
|
||||
# ================================================================================
|
||||
# The entries in this file are in the following machine-readable format:
|
||||
#
|
||||
# <code>; <status>; <mapping>; # <name>
|
||||
#
|
||||
# The status field is:
|
||||
# C: common case folding, common mappings shared by both simple and full mappings.
|
||||
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
|
||||
# S: simple case folding, mappings to single characters where different from F.
|
||||
# I: special case for dotted uppercase I and dotless lowercase i
|
||||
# - If this mapping is included, the result is case-insensitive, but dotless and dotted I's are not distinguished.
|
||||
# - If this mapping is excluded, the result is not fully case-insensitive, but dotless and dotted I's are distinguished.
|
||||
#
|
||||
# Usage:
|
||||
# A. To do a simple case folding, use the mappings with status C + S + I.
|
||||
# B. To do a full case folding, use the mappings with status C + F + I.
|
||||
# The mappings with status I can be omitted depending on the desired case-folding
|
||||
# behavior. (The default option is to retain them.)
|
||||
#
|
||||
# =================================================================
|
||||
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2001/12/13 23:35:54 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -87,8 +87,6 @@ public final class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
*/
|
||||
private UnicodeProperty[] dprops = new UnicodeProperty[50];
|
||||
private Normalizer[] nf = new Normalizer[4];
|
||||
private Normalizer nfd, nfc, nfkd, nfkc;
|
||||
|
||||
static final String[] CaseNames = {
|
||||
"Uppercase",
|
||||
|
@ -99,7 +97,7 @@ public final class DerivedProperty implements UCD_Types {
|
|||
Normalizer nfx;
|
||||
ExDProp(int i) {
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
nfx = Main.nf[i];
|
||||
name = "Expands_On_" + nfx.getName();
|
||||
shortName = "XO_" + nfx.getName();
|
||||
header = "# Derived Property: " + name
|
||||
|
@ -123,7 +121,7 @@ public final class DerivedProperty implements UCD_Types {
|
|||
NF_UnsafeStartProp(int i) {
|
||||
isStandard = false;
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
nfx = Main.nf[i];
|
||||
name = nfx.getName() + "_UnsafeStart";
|
||||
shortName = nfx.getName() + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
|
@ -159,7 +157,7 @@ public final class DerivedProperty implements UCD_Types {
|
|||
case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
|
||||
}
|
||||
filter = bitsets[1] != null;
|
||||
nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
|
||||
Main.nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
|
||||
|
||||
name = Names[i-NFC_Leading];
|
||||
shortName = SNames[i-NFC_Leading];
|
||||
|
@ -193,19 +191,19 @@ public final class DerivedProperty implements UCD_Types {
|
|||
|
||||
GenDProp (int i) {
|
||||
isStandard = false;
|
||||
valueVaries = true;
|
||||
setValueType(NON_ENUMERATED);
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
nfx = Main.nf[i];
|
||||
name = nfx.getName();
|
||||
String compName = "the character itself";
|
||||
|
||||
if (i == NFKC || i == NFD) {
|
||||
name += "-NFC";
|
||||
nfComp = nfc;
|
||||
nfComp = Main.nfc;
|
||||
compName = "NFC for the character";
|
||||
} else if (i == NFKD) {
|
||||
name += "-NFD";
|
||||
nfComp = nfd;
|
||||
nfComp = Main.nfd;
|
||||
compName = "NFD for the character";
|
||||
}
|
||||
header = "# Derived Property: " + name
|
||||
|
@ -269,9 +267,9 @@ public final class DerivedProperty implements UCD_Types {
|
|||
String MAYBE;
|
||||
Normalizer nfx;
|
||||
QuickDProp (int i) {
|
||||
valueVaries = true;
|
||||
setValueType((i == NFC || i == NFKC) ? ENUMERATED : BINARY);
|
||||
type = DERIVED_NORMALIZATION;
|
||||
nfx = nf[i];
|
||||
nfx = Main.nf[i];
|
||||
NO = nfx.getName() + "_NO";
|
||||
MAYBE = nfx.getName() + "_MAYBE";
|
||||
name = nfx.getName() + "_QuickCheck";
|
||||
|
@ -291,11 +289,6 @@ public final class DerivedProperty implements UCD_Types {
|
|||
};
|
||||
|
||||
{
|
||||
nfd = nf[0] = new Normalizer(Normalizer.NFD);
|
||||
nfc = nf[1] = new Normalizer(Normalizer.NFC);
|
||||
nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
|
||||
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
|
||||
dprops[i] = new ExDProp(i-ExpandsOnNFD);
|
||||
}
|
||||
|
@ -493,7 +486,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
dprops[FC_NFKC_Closure] = new UnicodeProperty() {
|
||||
{
|
||||
type = DERIVED_NORMALIZATION;
|
||||
valueVaries = true;
|
||||
setValueType(NON_ENUMERATED);
|
||||
name = "FC_NFKC_Closure";
|
||||
shortName = "FC_NFKC";
|
||||
header = "# Derived Property: " + name
|
||||
|
@ -503,8 +496,8 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
public String getValue(int cp, byte style) {
|
||||
if (!ucdData.isRepresented(cp)) return "";
|
||||
String b = nfkc.normalize(fold(cp));
|
||||
String c = nfkc.normalize(fold(b));
|
||||
String b = Main.nfkc.normalize(fold(cp));
|
||||
String c = Main.nfkc.normalize(fold(b));
|
||||
if (c.equals(b)) return "";
|
||||
return "FNC; " + Utility.hex(c);
|
||||
} // default
|
||||
|
@ -516,7 +509,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
type = DERIVED_NORMALIZATION;
|
||||
isStandard = false;
|
||||
name = "FC_NFC_Closure";
|
||||
valueVaries = true;
|
||||
setValueType(NON_ENUMERATED);
|
||||
shortName = "FC_NFC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
|
||||
|
@ -525,8 +518,8 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
public String getValue(int cp, byte style) {
|
||||
if (!ucdData.isRepresented(cp)) return "";
|
||||
String b = nfc.normalize(fold(cp));
|
||||
String c = nfc.normalize(fold(b));
|
||||
String b = Main.nfc.normalize(fold(cp));
|
||||
String c = Main.nfc.normalize(fold(b));
|
||||
if (c.equals(b)) return "";
|
||||
return "FN; " + Utility.hex(c);
|
||||
} // default
|
||||
|
@ -603,8 +596,9 @@ of characters, the first of which has a non-zero combining class.
|
|||
dprops[Type_i] = new UnicodeProperty() {
|
||||
{
|
||||
type = DERIVED_CORE;
|
||||
name = "Soft_Dotted";
|
||||
shortName = "SDot";
|
||||
isStandard = false;
|
||||
name = "DSoft_Dotted";
|
||||
shortName = "DSDot";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that"
|
||||
+ "\r\n# - starts with i or j"
|
||||
|
@ -613,21 +607,24 @@ of characters, the first of which has a non-zero combining class.
|
|||
;
|
||||
}
|
||||
boolean hasValue(int cp) {
|
||||
if (cp == 'i' || cp == 'j') return true;
|
||||
if (!nfkd.hasDecomposition(cp)) return false;
|
||||
String decomp = nfd.normalize(cp);
|
||||
if (hasSoftDot(cp)) return true;
|
||||
if (!Main.nfkd.hasDecomposition(cp)) return false;
|
||||
String decomp = Main.nfd.normalize(cp);
|
||||
boolean ok = false;
|
||||
for (int i = decomp.length()-1; i >= 0; --i) {
|
||||
char ch = decomp.charAt(i);
|
||||
int ch = UTF16.charAt(decomp, i);
|
||||
int cc = ucdData.getCombiningClass(ch);
|
||||
if (cc == 230) return false;
|
||||
if (cc == 0) {
|
||||
if (ch == 'i' || ch == 'j') ok = true;
|
||||
else return false;
|
||||
if (!hasSoftDot(ch)) return false;
|
||||
ok = true;
|
||||
}
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
/**
 * Tests a code point against a fixed list of letters treated here as
 * carrying a "soft dot".
 *
 * @param ch code point to test
 * @return true if ch is one of the five hard-coded code points below
 */
boolean hasSoftDot(int ch) {
|
||||
// 'i', 'j', U+0268 LATIN SMALL LETTER I WITH STROKE,
// U+0456 CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I, U+0458 CYRILLIC SMALL LETTER JE
return ch == 'i' || ch == 'j' || ch == 0x0268 || ch == 0x0456 || ch == 0x0458;
|
||||
}
|
||||
};
|
||||
|
||||
dprops[Case_Ignorable] = new UnicodeProperty() {
|
||||
|
@ -666,7 +663,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
for (int i = 0; i < dprops.length; ++i) {
|
||||
UnicodeProperty up = dprops[i];
|
||||
if (up == null) continue;
|
||||
if (up.valueVaries()) continue;
|
||||
if (up.getValueType() != BINARY) continue;
|
||||
up.setValue(NUMBER, "1");
|
||||
up.setValue(SHORT, "Y");
|
||||
up.setValue(LONG, "YES");
|
||||
|
@ -681,11 +678,11 @@ of characters, the first of which has a non-zero combining class.
|
|||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
|
||||
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
|
||||
|
||||
// if (true) throw new IllegalArgumentException("FIX nf[2]");
|
||||
// if (true) throw new IllegalArgumentException("FIX Main.nf[2]");
|
||||
|
||||
if (!nf[NFKD].normalizationDiffers(cp)) return Lo;
|
||||
if (!Main.nf[NFKD].normalizationDiffers(cp)) return Lo;
|
||||
|
||||
String norm = nf[NFKD].normalize(cp);
|
||||
String norm = Main.nf[NFKD].normalize(cp);
|
||||
int cp2;
|
||||
boolean gotUpper = false;
|
||||
boolean gotLower = false;
|
||||
|
@ -723,8 +720,8 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
|
||||
public static void test() {
|
||||
UCD ucd = UCD.make();
|
||||
DerivedProperty dprop = new DerivedProperty(ucd);
|
||||
Main.setUCD();
|
||||
DerivedProperty dprop = new DerivedProperty(Main.ucd);
|
||||
/*
|
||||
for (int j = 0; j < LIMIT; ++j) {
|
||||
System.out.println();
|
||||
|
@ -735,9 +732,9 @@ of characters, the first of which has a non-zero combining class.
|
|||
|
||||
for (int cp = 0xA0; cp < 0xFF; ++cp) {
|
||||
System.out.println();
|
||||
System.out.println(ucd.getCodeAndName(cp));
|
||||
System.out.println(Main.ucd.getCodeAndName(cp));
|
||||
for (int j = 0; j < DERIVED_PROPERTY_LIMIT; ++j) {
|
||||
String prop = make(j, ucd).getValue(cp);
|
||||
String prop = make(j, Main.ucd).getValue(cp);
|
||||
if (prop.length() != 0) System.out.println("\t" + prop);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/12/13 23:35:56 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -34,7 +34,7 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
this.ucdData = ucd;
|
||||
// this.dprop = new DerivedProperty(ucd);
|
||||
uprop = DerivedProperty.make(propMask, ucd);
|
||||
varies = uprop.valueVaries();
|
||||
varies = uprop.getValueType() != BINARY;
|
||||
|
||||
width = super.minPropertyWidth();
|
||||
switch (propMask) {
|
||||
|
@ -56,7 +56,7 @@ final class DerivedPropertyLister extends PropertyLister {
|
|||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
if (uprop.valueVaries()) return uprop.getValue(cp, LONG);
|
||||
if (uprop.getValueType() != BINARY) return uprop.getValue(cp, LONG);
|
||||
return uprop.getProperty(LONG);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/12/13 23:35:56 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -15,38 +15,72 @@ package com.ibm.text.UCD;
|
|||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
import com.ibm.text.UTF16;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class GenerateCaseFolding implements UCD_Types {
|
||||
public static boolean DEBUG = false;
|
||||
public static UCD ucd = UCD.make("");
|
||||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
makeCaseFold();
|
||||
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
|
||||
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
|
||||
public static boolean NF_CLOSURE = false; // ON to also add the NFD/NFC/NFKD/NFKC forms of each member when building equivalence classes (passed to getClosure as nfClose)
|
||||
// PICK_SHORT & NF_CLOSURE = false for old style
|
||||
|
||||
|
||||
/*public static void main(String[] args) throws java.io.IOException {
|
||||
makeCaseFold(arg[0]);
|
||||
//getAge();
|
||||
}
|
||||
|
||||
public static void makeCaseFold() throws java.io.IOException {
|
||||
*/
|
||||
|
||||
static PrintWriter log;
|
||||
|
||||
public static void makeCaseFold(boolean normalized) throws java.io.IOException {
|
||||
PICK_SHORT = NF_CLOSURE = normalized;
|
||||
|
||||
Main.setUCD();
|
||||
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true));
|
||||
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
|
||||
|
||||
System.out.println("Making Full Data");
|
||||
Map fullData = getCaseFolding(true);
|
||||
Map fullData = getCaseFolding(true, NF_CLOSURE);
|
||||
Utility.fixDot();
|
||||
System.out.println("Making Simple Data");
|
||||
Map simpleData = getCaseFolding(false);
|
||||
Map simpleData = getCaseFolding(false, NF_CLOSURE);
|
||||
// write the data
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing");
|
||||
String filename = "CaseFolding";
|
||||
if (normalized) filename += "-Normalized";
|
||||
String directory = "DerivedData/";
|
||||
PrintWriter out = Utility.openPrintWriter(directory + filename + GenerateData.getFileSuffix(true));
|
||||
GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
|
||||
|
||||
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
|
||||
out.println("#");
|
||||
out.println("# Generated: " + new Date() + ", MD");
|
||||
Utility.appendFile("CaseFoldingHeader.txt", false, out);
|
||||
|
||||
/*
|
||||
PrintWriter out = new PrintWriter(
|
||||
new BufferedWriter(
|
||||
new OutputStreamWriter(
|
||||
new FileOutputStream("CaseFoldingSample.txt"),
|
||||
new FileOutputStream(directory + fileRoot + GenerateData.getFileSuffix()),
|
||||
"UTF8"),
|
||||
4*1024));
|
||||
*/
|
||||
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
if (!charsUsed.get(ch)) continue;
|
||||
|
||||
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
|
||||
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
|
||||
if (rFull == null && rSimple == null) continue;
|
||||
if (rFull != null && rFull.equals(rSimple)) {
|
||||
if (rFull != null && rFull.equals(rSimple)
|
||||
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
|
||||
String type = "C";
|
||||
if (ch == 0x130 || ch == 0x131) type = "I";
|
||||
drawLine(out, ch, type, rFull);
|
||||
|
@ -60,35 +94,63 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
}
|
||||
out.close();
|
||||
log.close();
|
||||
}
|
||||
|
||||
static void drawLine(PrintWriter out, int ch, String type, String result) {
|
||||
String comment = "";
|
||||
if (COMMENT_DIFFS) {
|
||||
String lower = Main.ucd.getCase(UTF16.valueOf(ch), FULL, LOWER);
|
||||
if (!lower.equals(result)) {
|
||||
String upper = Main.ucd.getCase(UTF16.valueOf(ch), FULL, UPPER);
|
||||
String lower2 = Main.ucd.getCase(UTF16.valueOf(ch), FULL, LOWER);
|
||||
if (lower.equals(lower2)) {
|
||||
comment = "[Diff " + Utility.hex(lower, " ") + "] ";
|
||||
} else {
|
||||
Utility.fixDot();
|
||||
System.out.println("PROBLEM WITH: " + Main.ucd.getCodeAndName(ch));
|
||||
comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] ";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.println(Utility.hex(ch)
|
||||
+ "; " + type +
|
||||
"; " + Utility.hex(result, " ") +
|
||||
"; # " + ucd.getName(ch));
|
||||
+ "; " + type
|
||||
+ "; " + Utility.hex(result, " ")
|
||||
+ "; # " + comment + Main.ucd.getName(ch));
|
||||
}
|
||||
|
||||
static int probeCh = 0x01f0;
|
||||
static String shower = UTF16.valueOf(probeCh);
|
||||
|
||||
static Map getCaseFolding(boolean full) throws java.io.IOException {
|
||||
static Map getCaseFolding(boolean full, boolean nfClose) throws java.io.IOException {
|
||||
Map data = new TreeMap();
|
||||
Map repChar = new TreeMap();
|
||||
//String option = "";
|
||||
|
||||
// get the equivalence classes
|
||||
|
||||
for (int ch = 0; ch < 0x10FFFF; ++ch) {
|
||||
if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
||||
if (!ucd.isRepresented(ch)) continue;
|
||||
getClosure(ch, data, full);
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
|
||||
if (!Main.ucd.isRepresented(ch)) continue;
|
||||
getClosure(ch, data, full, nfClose);
|
||||
}
|
||||
|
||||
// get the representative characters
|
||||
|
||||
|
||||
Iterator it = data.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
Set set = (Set) data.get(s);
|
||||
show = set.contains(shower);
|
||||
if (show) {
|
||||
Utility.fixDot();
|
||||
System.out.println(toString(set));
|
||||
}
|
||||
|
||||
// Pick the best available representative
|
||||
|
||||
String rep = null;
|
||||
int repGood = 0;
|
||||
String dup = null;
|
||||
|
@ -104,30 +166,63 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
dup = s2;
|
||||
}
|
||||
}
|
||||
if (rep == null) System.err.println("No representative for: " + toString(set));
|
||||
else if (repGood < 128) {
|
||||
System.err.println("Non-optimal!!: "
|
||||
+ ucd.getName(rep) + ", " + toString(set,true));
|
||||
if (rep == null) {
|
||||
Utility.fixDot();
|
||||
System.err.println("No representative for: " + toString(set));
|
||||
} else if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) {
|
||||
String message = "";
|
||||
if ((repGood & NFC_FORMAT) == 0) {
|
||||
message += " [NOT NFC FORMAT]";
|
||||
}
|
||||
if ((repGood & ISLOWER) == 0) {
|
||||
message += " [NOT LOWERCASE]";
|
||||
}
|
||||
Utility.fixDot();
|
||||
log.println("Non-Optimal Representative " + message);
|
||||
log.println(" Rep:\t" + Main.ucd.getCodeAndName(rep));
|
||||
log.println(" Set:\t" + toString(set,true, true));
|
||||
}
|
||||
|
||||
// Add it for all the elements of the set
|
||||
|
||||
it2 = set.iterator();
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String)it2.next();
|
||||
if (s2.length() == 1 && !s2.equals(rep)) repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
|
||||
if (UTF16.countCodePoint(s2) == 1 && !s2.equals(rep)) {
|
||||
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
|
||||
charsUsed.set(UTF16.charAt(s2, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
return repChar;
|
||||
}
|
||||
|
||||
static BitSet charsUsed = new BitSet();
|
||||
static boolean show = false;
|
||||
static final int NFC_FORMAT = 64;
|
||||
static final int ISLOWER = 128;
|
||||
|
||||
static int goodness(String s, boolean full) {
|
||||
if (s == null) return 0;
|
||||
int result = s.length();
|
||||
if (s.equals(lower(upper(s, full), full))) result |= 128;
|
||||
if (s.equals(NFC.normalize(s))) result |= 64;
|
||||
int result = 32-s.length();
|
||||
if (!PICK_SHORT) {
|
||||
result = s.length();
|
||||
}
|
||||
if (!full) result <<= 8;
|
||||
String low = lower(upper(s, full), full);
|
||||
if (s.equals(low)) result |= ISLOWER;
|
||||
else if (PICK_SHORT && Main.nfd.normalize(s).equals(Main.nfd.normalize(low))) result |= ISLOWER;
|
||||
|
||||
if (s.equals(Main.nfc.normalize(s))) result |= NFC_FORMAT;
|
||||
|
||||
if (show) {
|
||||
Utility.fixDot();
|
||||
System.out.println(Utility.hex(result) + ", " + Main.ucd.getCodeAndName(s));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static Normalizer NFC = new Normalizer(Normalizer.NFC);
|
||||
/*
|
||||
static HashSet temp = new HashSet();
|
||||
static void normalize(HashSet set) {
|
||||
|
@ -151,33 +246,33 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
|
||||
/*
|
||||
String
|
||||
String lower1 = ucd.getLowercase(ch);
|
||||
String lower2 = ucd.toLowercase(ch,option);
|
||||
String lower1 = Main.ucd.getLowercase(ch);
|
||||
String lower2 = Main.ucd.toLowercase(ch,option);
|
||||
|
||||
char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
|
||||
//String lower1 = String.valueOf(ucd.getLowercase(ch));
|
||||
//String lower = ucd.toLowercase(ch2,option);
|
||||
String upper = ucd.toUppercase(ch2,option);
|
||||
String lowerUpper = ucd.toLowercase(upper,option);
|
||||
//String title = ucd.toTitlecase(ch2,option);
|
||||
//String lowerTitle = ucd.toLowercase(upper,option);
|
||||
char ch2 = Main.ucd.getLowercase(Main.ucd.getUppercase(ch).charAt(0)).charAt(0);
|
||||
//String lower1 = String.valueOf(Main.ucd.getLowercase(ch));
|
||||
//String lower = Main.ucd.toLowercase(ch2,option);
|
||||
String upper = Main.ucd.toUppercase(ch2,option);
|
||||
String lowerUpper = Main.ucd.toLowercase(upper,option);
|
||||
//String title = Main.ucd.toTitlecase(ch2,option);
|
||||
//String lowerTitle = Main.ucd.toLowercase(upper,option);
|
||||
|
||||
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
|
||||
output.println(Utility.hex(ch)
|
||||
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
|
||||
+ "; " + Utility.hex(lowerUpper," ")
|
||||
+ ";\t#" + ucd.getName(ch)
|
||||
+ ";\t#" + Main.ucd.getName(ch)
|
||||
);
|
||||
//if (!lowerUpper.equals(lower)) {
|
||||
// output.println("Warning1: " + Utility.hex(lower) + " " + ucd.getName(lower));
|
||||
// output.println("Warning1: " + Utility.hex(lower) + " " + Main.ucd.getName(lower));
|
||||
//}
|
||||
//if (!lowerUpper.equals(lowerTitle)) {
|
||||
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + ucd.getName(lowerTitle));
|
||||
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Main.ucd.getName(lowerTitle));
|
||||
//}
|
||||
}
|
||||
*/
|
||||
|
||||
static void getClosure(int ch, Map data, boolean full) {
|
||||
static void getClosure(int ch, Map data, boolean full, boolean nfClose) {
|
||||
String charStr = UTF32.valueOf32(ch);
|
||||
String lowerStr = lower(charStr, full);
|
||||
String titleStr = title(charStr, full);
|
||||
|
@ -202,7 +297,13 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
while (it.hasNext()) {
|
||||
String s = (String) it.next();
|
||||
// do funny stuff since we can't modify set while iterating
|
||||
//if (add(set, NFC.normalize(s), data)) continue main;
|
||||
// We don't do this because if the source is not normalized, we don't want to normalize
|
||||
if (nfClose) {
|
||||
if (add(set, Main.nfd.normalize(s), data)) continue main;
|
||||
if (add(set, Main.nfc.normalize(s), data)) continue main;
|
||||
if (add(set, Main.nfkd.normalize(s), data)) continue main;
|
||||
if (add(set, Main.nfkc.normalize(s), data)) continue main;
|
||||
}
|
||||
if (add(set, lower(s, full), data)) continue main;
|
||||
if (add(set, title(s, full), data)) continue main;
|
||||
if (add(set, upper(s, full), data)) continue main;
|
||||
|
@ -216,31 +317,34 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
return result.replace('\u03C2', '\u03C3'); // HACK for lower
|
||||
}
|
||||
|
||||
// These functions are no longer necessary, since UCD is parameterized,
|
||||
// These functions are no longer necessary, since Main.ucd is parameterized,
|
||||
// but it's not worth changing
|
||||
|
||||
static String lower2(String s, boolean full) {
|
||||
if (!full) {
|
||||
/*if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
|
||||
return Main.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
|
||||
}
|
||||
return ucd.getCase(s, FULL, LOWER);
|
||||
*/
|
||||
return Main.ucd.getCase(s, full ? FULL : SIMPLE, LOWER);
|
||||
}
|
||||
|
||||
static String upper(String s, boolean full) {
|
||||
if (!full) {
|
||||
/* if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
|
||||
return Main.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
|
||||
}
|
||||
return ucd.getCase(s, SIMPLE, UPPER);
|
||||
*/
|
||||
return Main.ucd.getCase(s, full ? FULL : SIMPLE, UPPER);
|
||||
}
|
||||
|
||||
static String title(String s, boolean full) {
|
||||
if (!full) {
|
||||
/*if (!full) {
|
||||
if (s.length() != 1) return s;
|
||||
return ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
|
||||
return Main.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
|
||||
}
|
||||
return ucd.getCase(s, SIMPLE, TITLE);
|
||||
*/
|
||||
return Main.ucd.getCase(s, full ? FULL : SIMPLE, TITLE);
|
||||
}
|
||||
|
||||
static boolean add(Set set, String s, Map data) {
|
||||
|
@ -261,28 +365,173 @@ public class GenerateCaseFolding implements UCD_Types {
|
|||
}
|
||||
|
||||
static String toString(Set set) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
boolean first = true;
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String) it2.next();
|
||||
if (!first) result += ", ";
|
||||
first = false;
|
||||
result += Utility.hex(s2, " ");
|
||||
}
|
||||
return result + "}";
|
||||
return toString(set, false, false);
|
||||
}
|
||||
|
||||
static String toString(Set set, boolean t) {
|
||||
static String toString(Set set, boolean name, boolean crtab) {
|
||||
String result = "{";
|
||||
Iterator it2 = set.iterator();
|
||||
boolean first = true;
|
||||
while (it2.hasNext()) {
|
||||
String s2 = (String) it2.next();
|
||||
if (!first) result += ", ";
|
||||
if (!first) {
|
||||
if (crtab) {
|
||||
result += ";\r\n\t";
|
||||
} else {
|
||||
result += "; ";
|
||||
}
|
||||
}
|
||||
first = false;
|
||||
result += ucd.getName(s2);
|
||||
if (name) {
|
||||
result += Main.ucd.getCodeAndName(s2);
|
||||
} else {
|
||||
result += Utility.hex(s2, " ");
|
||||
}
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
/**
 * Reports whether a code point changes under the "special" normalization
 * used when generating SpecialCasing data.
 *
 * @param ch code point to test
 * @return true for U+00DF unconditionally; otherwise whatever
 *         Main.nfkd.normalizationDiffers(ch) reports
 */
static boolean specialNormalizationDiffers(int ch) {
|
||||
// U+00DF is forced to "differ" here; specialNormalization maps it to "ss"
if (ch == 0x00DF) return true; // es-zed
|
||||
return Main.nfkd.normalizationDiffers(ch);
|
||||
}
|
||||
|
||||
/**
 * Normalizes a string with Main.nfkd, with one hard-coded override.
 *
 * @param s string to normalize
 * @return "ss" when s is exactly the one-character string U+00DF;
 *         otherwise Main.nfkd.normalize(s)
 */
static String specialNormalization(String s) {
|
||||
// Whole-string comparison: a U+00DF embedded in a longer string does not
// take this branch and is handed to the normalizer unchanged.
if (s.equals("\u00DF")) return "ss";
|
||||
return Main.nfkd.normalize(s);
|
||||
}
|
||||
|
||||
/**
 * Hard-coded exclusion list for special-casing output. In the caller visible
 * in this file (generateSpecialCasing), a mapping for an excluded code point
 * is written to the exceptions log, prefixed with "# ", instead of being
 * added to the sorted output map.
 *
 * @param ch code point to test
 * @return true if ch is one of the listed code points or ranges, or if its
 *         decomposition type is COMPAT_SQUARE; false otherwise
 */
static boolean isExcluded(int ch) {
|
||||
if (ch == 0x130) return true; // skip LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
if (ch == 0x0132 || ch == 0x0133) return true; // skip IJ, ij
|
||||
if (ch == 0x037A) return true; // skip GREEK YPOGEGRAMMENI
|
||||
if (0x249C <= ch && ch <= 0x24B5) return true; // skip PARENTHESIZED LATIN SMALL LETTER A..Z
|
||||
// NOTE(review): this range starts at U+20A8 RUPEE SIGN and runs through U+217B,
// so it covers every code point in between, not just the rupee sign.
// Confirm that the whole span is meant to be excluded.
if (0x20A8 <= ch && ch <= 0x217B) return true; // skip Rupee..

||||
// Remaining exclusions are decided by decomposition type rather than by code point.
byte type = Main.ucd.getDecompositionType(ch);
|
||||
if (type == COMPAT_SQUARE) return true;
|
||||
// Disabled check: COMPAT_UNSPECIFIED decompositions are currently NOT excluded.
//if (type == COMPAT_UNSPECIFIED) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static void generateSpecialCasing() throws IOException {
|
||||
Main.setUCD();
|
||||
Map sorted = new TreeMap();
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions" + GenerateData.getFileSuffix(true));
|
||||
|
||||
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
|
||||
Utility.dot(ch);
|
||||
if (!Main.ucd.isRepresented(ch)) continue;
|
||||
if (!specialNormalizationDiffers(ch)) continue;
|
||||
|
||||
String lower = Main.nfc.normalize(Main.ucd.getCase(ch, SIMPLE, LOWER));
|
||||
String upper = Main.nfc.normalize(Main.ucd.getCase(ch, SIMPLE, UPPER));
|
||||
String title = Main.nfc.normalize(Main.ucd.getCase(ch, SIMPLE, TITLE));
|
||||
|
||||
String chstr = UTF16.valueOf(ch);
|
||||
|
||||
String decomp = specialNormalization(chstr);
|
||||
String flower = Main.nfc.normalize(Main.ucd.getCase(decomp, SIMPLE, LOWER));
|
||||
String fupper = Main.nfc.normalize(Main.ucd.getCase(decomp, SIMPLE, UPPER));
|
||||
String ftitle = Main.nfc.normalize(Main.ucd.getCase(decomp, SIMPLE, TITLE));
|
||||
|
||||
String base = Main.nfc.normalize(decomp);
|
||||
String blower = Main.nfc.normalize(specialNormalization(lower));
|
||||
String bupper = Main.nfc.normalize(specialNormalization(upper));
|
||||
String btitle = Main.nfc.normalize(specialNormalization(title));
|
||||
|
||||
if (ch == 0x249c) {
|
||||
System.out.println("Code: " + Main.ucd.getCodeAndName(ch));
|
||||
System.out.println("Decomp: " + Main.ucd.getCodeAndName(decomp));
|
||||
System.out.println("Base: " + Main.ucd.getCodeAndName(base));
|
||||
System.out.println("SLower: " + Main.ucd.getCodeAndName(lower));
|
||||
System.out.println("FLower: " + Main.ucd.getCodeAndName(flower));
|
||||
System.out.println("BLower: " + Main.ucd.getCodeAndName(blower));
|
||||
System.out.println("STitle: " + Main.ucd.getCodeAndName(title));
|
||||
System.out.println("FTitle: " + Main.ucd.getCodeAndName(ftitle));
|
||||
System.out.println("BTitle: " + Main.ucd.getCodeAndName(btitle));
|
||||
System.out.println("SUpper: " + Main.ucd.getCodeAndName(upper));
|
||||
System.out.println("FUpper: " + Main.ucd.getCodeAndName(fupper));
|
||||
System.out.println("BUpper: " + Main.ucd.getCodeAndName(bupper));
|
||||
}
|
||||
|
||||
// presumably if there is a single code point, it would already be in the simple mappings
|
||||
|
||||
if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
|
||||
&& UTF16.countCodePoint(title) == 1) continue;
|
||||
|
||||
// if there is no change from the base, skip
|
||||
|
||||
if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) continue;
|
||||
|
||||
// fix special cases
|
||||
// if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
|
||||
if (flower.equals(blower)) flower = lower;
|
||||
if (fupper.equals(bupper)) fupper = upper;
|
||||
if (ftitle.equals(btitle)) ftitle = title;
|
||||
|
||||
// if there are no changes from the original, or the expanded original, skip
|
||||
|
||||
if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) continue;
|
||||
|
||||
String name = Main.ucd.getName(ch);
|
||||
String mapping = Utility.hex(ch)
|
||||
+ "; " + Utility.hex(flower.equals(base) ? chstr : flower)
|
||||
+ "; " + Utility.hex(ftitle.equals(base) ? chstr : ftitle)
|
||||
+ "; " + Utility.hex(fupper.equals(base) ? chstr : fupper)
|
||||
+ "; # " + Main.ucd.getName(ch);
|
||||
|
||||
int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
|
||||
: name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 3
|
||||
: name.indexOf("LIGATURE") >= 0 ? 2
|
||||
: name.indexOf("GEGRAMMENI") < 0 ? 4
|
||||
: UTF16.countCodePoint(ftitle) == 1 ? 5
|
||||
: UTF16.countCodePoint(fupper) == 2 ? 6
|
||||
: 7;
|
||||
|
||||
|
||||
// special exclusions
|
||||
if (isExcluded(ch)) {
|
||||
log.println("# " + mapping);
|
||||
} else {
|
||||
sorted.put(new Integer((order << 24) | ch), mapping);
|
||||
}
|
||||
}
|
||||
log.close();
|
||||
|
||||
System.out.println("Writing");
|
||||
PrintWriter out = Utility.openPrintWriter("DerivedData/SpecialCasing" + GenerateData.getFileSuffix(true));
|
||||
GenerateData.generateBat("DerivedData/", "SpecialCasing", GenerateData.getFileSuffix(true));
|
||||
Utility.appendFile("SpecialCasingHeader.txt", true, out);
|
||||
|
||||
Iterator it = sorted.keySet().iterator();
|
||||
int lastOrder = -1;
|
||||
while (it.hasNext()) {
|
||||
Integer key = (Integer) it.next();
|
||||
String line = (String) sorted.get(key);
|
||||
int order = key.intValue() >> 24;
|
||||
if (order != lastOrder) {
|
||||
lastOrder = order;
|
||||
out.println();
|
||||
boolean skipLine = false;
|
||||
switch(order) {
|
||||
case 1:
|
||||
out.println("# The German es-zed is special--the normal mapping is to SS.");
|
||||
out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
|
||||
break;
|
||||
case 2: out.println("# Ligatures"); break;
|
||||
case 3: skipLine = true; break;
|
||||
case 4: out.println("# No corresponding uppercase precomposed character"); break;
|
||||
case 5: Utility.appendFile("SpecialCasingIota.txt", true, out); break;
|
||||
case 6: out.println("# Some characters with YPOGEGRAMMENI are also have no corresponding titlecases"); break;
|
||||
case 7: skipLine = true; break;
|
||||
}
|
||||
if (!skipLine) out.println();
|
||||
}
|
||||
out.println(line);
|
||||
}
|
||||
Utility.appendFile("SpecialCasingFooter.txt", true, out);
|
||||
out.close();
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load diff
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MLStreamWriter.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -245,7 +245,7 @@ public class MLStreamWriter extends Writer {
|
|||
boolean isHTML;
|
||||
ArrayList stack = new ArrayList();
|
||||
boolean inElement = false;
|
||||
Normalizer formC = new Normalizer(Normalizer.NFC);
|
||||
Normalizer formC = new Normalizer(Normalizer.NFC, "");
|
||||
int len;
|
||||
int maxLineLength = 60;
|
||||
// later, add better line end management, indenting
|
||||
|
|
|
@ -5,17 +5,59 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/12/13 23:35:56 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.util.Date;
|
||||
|
||||
public final class Main {
|
||||
static String ucdVersion = "";
|
||||
public final class Main implements UCD_Types {
|
||||
|
||||
static String ucdVersion = UCD.latestVersion;
|
||||
static UCD ucd;
|
||||
static Normalizer nfc;
|
||||
static Normalizer nfd;
|
||||
static Normalizer nfkc;
|
||||
static Normalizer nfkd;
|
||||
static Normalizer[] nf = new Normalizer[4];
|
||||
|
||||
/**
 * Initializes the shared static state used by the tools in this package.
 *
 * Builds the UCD instance for {@code Main.ucdVersion} and one Normalizer per
 * normalization form (NFD, NFC, NFKD, NFKC) for that same version. Each
 * normalizer is stored both in its named static field and in the {@code nf}
 * array, indexed by the NFD/NFC/NFKD/NFKC constants (presumably the ones
 * inherited from UCD_Types -- confirm). Finally prints the loaded UCD
 * version and date to standard output.
 */
static void setUCD() {
|
||||
ucd = UCD.make(Main.ucdVersion);
|
||||
nfd = nf[NFD] = new Normalizer(Normalizer.NFD, Main.ucdVersion);
|
||||
nfc = nf[NFC] = new Normalizer(Normalizer.NFC, Main.ucdVersion);
|
||||
nfkd = nf[NFKD] = new Normalizer(Normalizer.NFKD, Main.ucdVersion);
|
||||
nfkc = nf[NFKC] = new Normalizer(Normalizer.NFKC, Main.ucdVersion);
|
||||
// Report which UCD version and date were loaded.
System.out.println("Loaded UCD" + ucd.getVersion() + " " + (new Date(Main.ucd.getDate())));
|
||||
}
|
||||
|
||||
/**
 * Option names that the "All" option of {@code extras} expands to.
 *
 * {@code extras} appends this array to its argument list, so every entry
 * must be an option name that {@code extras} itself recognizes.
 * "OtherDerivedProperties" is deliberately left out (commented out below).
 */
static final String[] ALL_FILES = {
|
||||
"CaseFolding",
|
||||
"CompositionExclusions",
|
||||
"DerivedBidiClass",
|
||||
"DerivedBinaryProperties",
|
||||
"DerivedCombiningClass",
|
||||
"DerivedCoreProperties",
|
||||
"DerivedDecompositionType",
|
||||
"DerivedEastAsianWidth",
|
||||
"DerivedGeneralCategory",
|
||||
"DerivedJoiningGroup",
|
||||
"DerivedJoiningType",
|
||||
"DerivedLineBreak",
|
||||
"DerivedNormalizationProperties",
|
||||
"DerivedNumericType",
|
||||
"DerivedNumericValues",
|
||||
"NormalizationTest",
|
||||
"PropertyAliases",
|
||||
"PropList",
|
||||
"Scripts",
|
||||
"SpecialCasing",
|
||||
"DerivedAge",
|
||||
//"OtherDerivedProperties",
|
||||
};
|
||||
|
||||
public static void main (String[] args) throws Exception {
|
||||
|
||||
|
@ -26,19 +68,19 @@ public final class Main {
|
|||
Utility.fixDot();
|
||||
System.out.println("Argument: " + args[i]);
|
||||
|
||||
if (arg.equalsIgnoreCase("all")) {
|
||||
//checkCase();
|
||||
if (arg.equalsIgnoreCase("verify")) {
|
||||
VerifyUCD.verify();
|
||||
VerifyUCD.checkCanonicalProperties();
|
||||
VerifyUCD.CheckCaseFold();
|
||||
VerifyUCD.checkAgainstUInfo();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("build")) {
|
||||
ConvertUCD.main(new String[]{ucdVersion});
|
||||
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
|
||||
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{ucdVersion});
|
||||
else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
|
||||
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
|
||||
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
|
||||
|
||||
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
|
||||
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
|
||||
|
@ -52,19 +94,180 @@ public final class Main {
|
|||
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
|
||||
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
|
||||
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
|
||||
else if (arg.equalsIgnoreCase("Generate")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
|
||||
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
|
||||
/*else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
|
||||
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
|
||||
*/
|
||||
else {
|
||||
System.out.println("Unknown option -- must be one of the following (case-insensitive)");
|
||||
System.out.println("generateXML, checkCase, checkCanonicalProperties, CheckCaseFold,");
|
||||
System.out.println("VerifyIDN, NFTest, test1, ");
|
||||
// System.out.println(checkAgainstUInfo,");
|
||||
System.out.println("checkScripts, IdentifierTest, writeNormalizerTestSuite");
|
||||
}
|
||||
else extras(new String[] {arg});
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs the data-file generators named by the given options, in order.
 *
 * Option names are matched case-insensitively. The option "All" appends
 * every entry of ALL_FILES to the end of the argument list, so those
 * entries are handled by later iterations of the same loop. An argument
 * that starts with '#' ends processing immediately (rest of line is a
 * comment).
 *
 * @param args option names to process
 * @throws IllegalArgumentException if an option name is not recognized
 * @throws Exception whatever the invoked generators throw
 */
public static void extras (String[] args) throws Exception {
|
||||
//ubp = new UnifiedBinaryProperty(ucd);
|
||||
|
||||
// Set once "All" has been seen; from then on each argument is echoed.
boolean expanding = false;
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
// NOTE(review): an empty argument would make charAt(0) throw
// StringIndexOutOfBoundsException -- confirm callers never pass "".
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
// Only referenced by the commented-out options near the end of the chain.
long mask = 0;
|
||||
|
||||
Utility.fixDot();
|
||||
if (expanding) System.out.println("Argument: " + args[i]);
|
||||
|
||||
if (arg.equalsIgnoreCase("All")) {
|
||||
// Append all args at end
|
||||
String[] temp = new String[args.length + ALL_FILES.length];
|
||||
System.arraycopy(args, 0, temp, 0, args.length);
|
||||
System.arraycopy(ALL_FILES, 0, temp, args.length, ALL_FILES.length);
|
||||
args = temp;
|
||||
expanding = true;
|
||||
|
||||
// EXTRACTED PROPERTIES
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
|
||||
GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedBidiClass");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedBinaryProperties")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedBinaryProperties" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
|
||||
GenerateData.generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedCombiningClass" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedDecompositionType")) {
|
||||
GenerateData.generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedDecompositionType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
|
||||
GenerateData.generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedEastAsianWidth" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
|
||||
GenerateData.generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedGeneralCategory" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedJoiningGroup")) {
|
||||
GenerateData.generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedJoiningGroup" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedJoiningType")) {
|
||||
GenerateData.generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedJoiningType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
|
||||
GenerateData.generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedLineBreak" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericType")) {
|
||||
GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedNumericType" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
|
||||
GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/DerivedExtractedProperties/", "DerivedNumericValues" );
|
||||
|
||||
// OTHER STANDARD PROPERTIES
|
||||
|
||||
} else if (arg.equalsIgnoreCase("CaseFolding")) {
|
||||
GenerateCaseFolding.makeCaseFold(true);
|
||||
GenerateCaseFolding.makeCaseFold(false);
|
||||
|
||||
} else if (arg.equalsIgnoreCase("SpecialCasing")) {
|
||||
GenerateCaseFolding.generateSpecialCasing();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("CompositionExclusions")) {
|
||||
GenerateData.generateCompExclusions();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedAge")) {
|
||||
GenerateData.generateAge("DerivedData/", "DerivedAge");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
|
||||
GenerateData.generateDerived(DERIVED_CORE, true, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedCoreProperties");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedNormalizationProperties")) {
|
||||
GenerateData.generateDerived(DERIVED_NORMALIZATION, true, GenerateData.HEADER_DERIVED, "DerivedData/",
|
||||
"DerivedNormalizationProperties" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("NormalizationTest")) {
|
||||
GenerateData.writeNormalizerTestSuite("DerivedData/", "NormalizationTest");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("PropertyAliases")) {
|
||||
GenerateData.generatePropertyAliases();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("PropList")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + NEXT_ENUM,
|
||||
GenerateData.HEADER_EXTEND, "DerivedData/", "PropList");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("Scripts")) {
|
||||
GenerateData.generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM,
|
||||
GenerateData.HEADER_SCRIPTS, "DerivedData/", "Scripts");
|
||||
// OTHER TESTING
|
||||
|
||||
} else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
|
||||
//mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
|
||||
GenerateData.generateDerived(ALL, false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("AllBinary")) {
|
||||
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
|
||||
GenerateData.HEADER_EXTEND, "OtherDerived/", "AllBinary");
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedGeneralCategoryTEST")) {
|
||||
GenerateData.generateVerticalSlice(CATEGORY+29, CATEGORY+32, GenerateData.HEADER_DERIVED,
|
||||
"DerivedData/", "DerivedGeneralCategory" );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("differences")) {
|
||||
GenerateData.listDifferences();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("partition")) {
|
||||
GenerateData.partitionProperties();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listAccents")) {
|
||||
GenerateData.listCombiningAccents();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listGreekVowels")) {
|
||||
GenerateData.listGreekVowels();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listKatakana")) {
|
||||
GenerateData.listKatakana();
|
||||
|
||||
/*
|
||||
} else if (arg.equalsIgnoreCase("DerivedFullNormalization")) {
|
||||
mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC);
|
||||
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedFullNormalization" );
|
||||
} else if (arg.equalsIgnoreCase("caseignorable")) {
|
||||
mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i);
|
||||
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "CaseIgnorable" );
|
||||
} else if (arg.equalsIgnoreCase("nfunsafestart")) {
|
||||
mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart);
|
||||
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "NFUnsafeStart");
|
||||
*/
|
||||
|
||||
} else {
|
||||
throw new IllegalArgumentException(" ! Unknown option -- see Main.java for options");
|
||||
}
|
||||
|
||||
|
||||
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
|
||||
//GenerateData.generateDerived(Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf),
|
||||
// GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedPropData2" );
|
||||
//GenerateData.generateVerticalSlice(SCRIPT, SCRIPT+1, "ScriptCommon" );
|
||||
//listStrings("LowerCase" , 0,0);
|
||||
//GenerateData.generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedData/", "DerivedPropData1" );
|
||||
|
||||
// AGE stuff
|
||||
//UCD ucd = UCD.make();
|
||||
//System.out.println(ucd.getAgeID(0x61));
|
||||
//System.out.println(ucd.getAgeID(0x2FA1D));
|
||||
|
||||
//
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -53,6 +53,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
}
|
||||
|
||||
public String valueName(int cp) {
|
||||
if (up.getValueType() == BINARY) return up.getName();
|
||||
return up.getValue(cp);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -49,9 +49,9 @@ public final class Normalizer implements UCD_Types {
|
|||
/**
|
||||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public Normalizer(byte form) {
|
||||
this(form,"");
|
||||
}
|
||||
// public Normalizer(byte form) {
|
||||
// this(form,"");
|
||||
//}
|
||||
|
||||
/**
|
||||
* Return string name
|
||||
|
|
|
@ -26,13 +26,12 @@
|
|||
#
|
||||
# NOTE: The property value names are NOT unique across properties, especially
|
||||
# with loose matches. For example,
|
||||
#
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names:
|
||||
# cc means Combining_Class property, and
|
||||
# cc means the General_Category property value Control (cc)
|
||||
# In addition, some property names may be the same as some property value names.
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
# For more information, see UTR #24: Regular Expression Guidelines
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -168,13 +168,20 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
return lastSpace;
|
||||
}
|
||||
|
||||
private static final byte FAKERC = 63; // fake category for comparison
|
||||
private static final byte FAKELC = 63; // fake category for comparison
|
||||
private static final byte FAKENC = 64; // fake category for comparison
|
||||
|
||||
private byte getModCat(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lt || cat == Ll || cat == Lu) cat = FAKELC;
|
||||
if (cat == Cn && ucdData.isNoncharacter(cp)) cat = FAKENC;
|
||||
if (cat == UNASSIGNED && ucdData.isNoncharacter(cp)) cat = FAKENC;
|
||||
else if (breakByCategory) {
|
||||
if (cat == Lt || cat == Ll || cat == Lu) cat = FAKELC;
|
||||
} else {
|
||||
// MASH almost everything together
|
||||
if (cat != CONTROL && cat != FORMAT && cat != SURROGATE
|
||||
&& cat != PRIVATE_USE && cat != UNASSIGNED) cat = FAKERC;
|
||||
}
|
||||
return cat;
|
||||
}
|
||||
|
||||
|
@ -196,7 +203,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
byte s = status(cp);
|
||||
if (alwaysBreaks && s == INCLUDE) s = BREAK;
|
||||
if (s == INCLUDE && firstRealCp != -1) {
|
||||
if (breakByCategory && getModCat(cp) != firstRealCpCat) s = BREAK;
|
||||
if (getModCat(cp) != firstRealCpCat) s = BREAK;
|
||||
}
|
||||
|
||||
switch(s) {
|
||||
|
|
|
@ -22,6 +22,9 @@
|
|||
#
|
||||
# Third Field: The third field is a long name.
|
||||
#
|
||||
# In the case of ccc, there are 4 fields. The second field is numeric, third
|
||||
# is abbreviated, and fourth is long.
|
||||
#
|
||||
# With loose matching of property names, the case distinctions, whitespace,
|
||||
# and '_' are ignored.
|
||||
#
|
||||
|
|
67
tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt
Normal file
67
tools/unicodetools/com/ibm/text/UCD/SpecialCasingFooter.txt
Normal file
|
@ -0,0 +1,67 @@
|
|||
# ================================================================================
|
||||
# Conditional mappings
|
||||
# ================================================================================
|
||||
|
||||
# Special case for final form of sigma
|
||||
|
||||
03A3; 03C2; 03A3; 03A3; FINAL_SIGMA; # GREEK CAPITAL LETTER SIGMA
|
||||
|
||||
# Note: the following cases for non-final are already in the UnicodeData file.
|
||||
|
||||
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
|
||||
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
|
||||
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
|
||||
|
||||
# Note: the following cases are not included, since they would case-fold in lowercasing
|
||||
|
||||
# 03C3; 03C2; 03A3; 03A3; FINAL_SIGMA; # GREEK SMALL LETTER SIGMA
|
||||
# 03C2; 03C3; 03A3; 03A3; NOT_FINAL_SIGMA; # GREEK SMALL LETTER FINAL SIGMA
|
||||
|
||||
# ================================================================================
|
||||
# Locale-sensitive mappings
|
||||
# ================================================================================
|
||||
|
||||
# Lithuanian
|
||||
|
||||
# Lithuanian retains the dot in a lowercase i when followed by accents.
|
||||
|
||||
# Remove DOT ABOVE after "i" with upper or titlecase
|
||||
|
||||
0307; 0307; ; ; lt AFTER_i # COMBINING DOT ABOVE
|
||||
|
||||
# Introduce an explicit dot above when lowercasing capital I's and J's
|
||||
# whenever there are more accents above
|
||||
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
|
||||
|
||||
0049; 0069 0307; 0049; 0049; lt MORE_ABOVE # LATIN CAPITAL LETTER I
|
||||
004A; 006A 0307; 004A; 004A; lt MORE_ABOVE # LATIN CAPITAL LETTER J
|
||||
012E; 012F 0307; 012E; 012E; lt MORE_ABOVE # LATIN CAPITAL LETTER I WITH OGONEK
|
||||
00CC; 0069 0307 0300; 00CC; 00CC; lt # LATIN CAPITAL LETTER I WITH GRAVE
|
||||
00CD; 0069 0307 0301; 00CD; 00CD; lt # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
0128; 0069 0307 0303; 0128; 0128; lt # LATIN CAPITAL LETTER I WITH TILDE
|
||||
|
||||
# ================================================================================
|
||||
|
||||
# Turkish and Azeri
|
||||
|
||||
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
# The following rules handle those cases.
|
||||
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
|
||||
0307; ; 0307; 0307; AFTER_I # COMBINING DOT ABOVE
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
0049; 0131; 0049; 0049; tr NOT_BEFORE_DOT; # LATIN CAPITAL LETTER I
|
||||
0049; 0131; 0049; 0049; az NOT_BEFORE_DOT; # LATIN CAPITAL LETTER I
|
||||
|
||||
# When uppercasing, i turns into a dotted capital I
|
||||
|
||||
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
|
||||
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
|
||||
|
||||
# Note: the following cases are already in the UnicodeData file.
|
||||
|
||||
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
|
||||
# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
60
tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
Normal file
60
tools/unicodetools/com/ibm/text/UCD/SpecialCasingHeader.txt
Normal file
|
@ -0,0 +1,60 @@
|
|||
# SpecialCasing-6.txt
|
||||
#
|
||||
# Special Casing Properties
|
||||
#
|
||||
# This file is a supplement to the UnicodeData file.
|
||||
# It contains additional information about the casing of Unicode characters.
|
||||
# (For compatibility, the UnicodeData.txt file only contains case mappings for
|
||||
# characters where they are 1-1, and does not have locale-specific mappings.)
|
||||
# For more information, see
|
||||
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
|
||||
#
|
||||
# ================================================================================
|
||||
# Format
|
||||
# ================================================================================
|
||||
# The entries in this file are in the following machine-readable format:
|
||||
#
|
||||
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
|
||||
#
|
||||
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more than
|
||||
# one character, they are separated by spaces.
|
||||
#
|
||||
# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
|
||||
# separated by spaces. In these conditions:
|
||||
# - A condition list overrides the normal behavior if all of the listed conditions are true.
|
||||
# - Case distinctions in the condition list are not significant.
|
||||
# - Conditions preceded by "NOT_" represent the negation of the condition.
|
||||
# - A cased letter is any character with general category = Lu or Ll or Lt
|
||||
# - An ignorable sequence is a sequence of *zero* or more characters from
|
||||
# the set {HYPHEN, SOFT HYPHEN, general category = Mn}.
|
||||
#
|
||||
# A locale is defined as:
|
||||
# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
|
||||
# <ISO_3166_code> := 2-letter ISO country code,
|
||||
# <ISO_639_code> := 2-letter ISO language code
|
||||
#
|
||||
# A context is a locale or one of the following choices:
|
||||
# CFINAL: The character is not followed by a sequence consisting of
|
||||
# an ignorable sequence and then a cased letter.
|
||||
# CINITIAL: The character is not preceded by a sequence consisting of
|
||||
# a cased letter and an ignorable sequence.
|
||||
# FINAL_SIGMA: CFINAL and NOT_CINITIAL
|
||||
# TYPE_i: The character is "i" (0069), "j" (006A),
|
||||
# or has a canonical decomposition that begins with an "i" or "j"
|
||||
# but has no combining characters above (i.e., i-ogonek (012F),
|
||||
# i-tilde-below (1E2D), or i-dot-below (1ECB)).
|
||||
# AFTER_i: The last preceding base character was TYPE_i, and
|
||||
# no combining character class 230 (above) has intervened.
|
||||
# MORE_ABOVE: The character is followed by one or more characters of
|
||||
# combining class 230 (ABOVE) in the combining character sequence
|
||||
#
|
||||
# Other than as used to separate elements, spaces are to be ignored.
|
||||
#
|
||||
# Parsers of this file must be prepared to deal with future additions to this format:
|
||||
# * Additional contexts
|
||||
# * Additional fields
|
||||
# ================================================================================
|
||||
|
||||
# ================================================================================
|
||||
# Unconditional mappings
|
||||
# ================================================================================
|
13
tools/unicodetools/com/ibm/text/UCD/SpecialCasingIota.txt
Normal file
13
tools/unicodetools/com/ibm/text/UCD/SpecialCasingIota.txt
Normal file
|
@ -0,0 +1,13 @@
|
|||
# IMPORTANT-when capitalizing iota-subscript (0345)
|
||||
# It MUST be in normalized form--moved to the end of any sequence of combining marks.
|
||||
# This is because logically it represents a following base character!
|
||||
# E.g. <iota_subscript> (<Mn> | <Mc> | <Me>)+ => (<Mn> | <Mc> | <Me>)+ <iota_subscript>
|
||||
# It should never be the first character in a word, so in titlecasing it can be left as is.
|
||||
|
||||
# The following cases are already in the UnicodeData file, so are only commented here.
|
||||
|
||||
# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
|
||||
|
||||
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
|
||||
# have special uppercases.
|
||||
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,6 +21,7 @@ import java.text.SimpleDateFormat;
|
|||
import com.ibm.text.utility.*;
|
||||
|
||||
public class TestData implements UCD_Types {
|
||||
/*
|
||||
|
||||
public static void main (String[] args) throws IOException {
|
||||
System.out.println("START");
|
||||
|
@ -200,7 +201,6 @@ public class TestData implements UCD_Types {
|
|||
}
|
||||
output.close();
|
||||
}
|
||||
*/
|
||||
|
||||
public static void generateCompExclusions() throws IOException {
|
||||
PrintWriter output = Utility.openPrintWriter("CompositionExclusionsDelta.txt");
|
||||
|
@ -246,7 +246,7 @@ public class TestData implements UCD_Types {
|
|||
System.out.println(ucd.getData(0x100000-3));
|
||||
if (true) return;
|
||||
String test2 = ucd.getName(0x2A6D6);
|
||||
//*/
|
||||
//* /
|
||||
|
||||
|
||||
PrintWriter output = Utility.openPrintWriter(file);
|
||||
|
@ -485,5 +485,5 @@ public class TestData implements UCD_Types {
|
|||
"E\u0304\u0300",
|
||||
"E\u0300\u0304",
|
||||
};
|
||||
|
||||
//*/
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
|
||||
* $Date: 2001/08/31 00:30:17 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -25,12 +25,6 @@ public final class TestNormalization {
|
|||
static PrintWriter out = null;
|
||||
static BufferedReader in = null;
|
||||
|
||||
static Normalizer nfc;
|
||||
static Normalizer nfd;
|
||||
static Normalizer nfkc;
|
||||
static Normalizer nfkd;
|
||||
static UCD ucd;
|
||||
|
||||
static BitSet charsListed = new BitSet(0x110000);
|
||||
static int errorCount = 0;
|
||||
static int lineErrorCount = 0;
|
||||
|
@ -39,18 +33,14 @@ public final class TestNormalization {
|
|||
|
||||
public static void main(String[] args) throws java.io.IOException {
|
||||
System.out.println("Creating Normalizers");
|
||||
ucd = UCD.make("");
|
||||
Main.setUCD();
|
||||
|
||||
nfc = new Normalizer(Normalizer.NFC);
|
||||
nfd = new Normalizer(Normalizer.NFD);
|
||||
nfkc = new Normalizer(Normalizer.NFKC);
|
||||
nfkd = new Normalizer(Normalizer.NFKD);
|
||||
|
||||
String x = UTF32.valueOf32(0x10000);
|
||||
check("NFC", nfc, x);
|
||||
check("NFD", nfd, x);
|
||||
check("NFKC", nfkc, x);
|
||||
check("NFKD", nfkd, x);
|
||||
check("NFC", Main.nfc, x);
|
||||
check("NFD", Main.nfd, x);
|
||||
check("NFKC", Main.nfkc, x);
|
||||
check("NFKD", Main.nfkd, x);
|
||||
|
||||
|
||||
out = new PrintWriter(
|
||||
|
@ -97,36 +87,36 @@ public final class TestNormalization {
|
|||
}
|
||||
|
||||
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
|
||||
errorCount += check("NFCa", nfc, parts[1], parts[0]);
|
||||
errorCount += check("NFCb", nfc, parts[1], parts[1]);
|
||||
errorCount += check("NFCc", nfc, parts[1], parts[2]);
|
||||
errorCount += check("NFCa", Main.nfc, parts[1], parts[0]);
|
||||
errorCount += check("NFCb", Main.nfc, parts[1], parts[1]);
|
||||
errorCount += check("NFCc", Main.nfc, parts[1], parts[2]);
|
||||
|
||||
// c4 == NFC(c4) == NFC(c5)
|
||||
errorCount += check("NFCd", nfc, parts[3], parts[3]);
|
||||
errorCount += check("NFCe", nfc, parts[3], parts[4]);
|
||||
errorCount += check("NFCd", Main.nfc, parts[3], parts[3]);
|
||||
errorCount += check("NFCe", Main.nfc, parts[3], parts[4]);
|
||||
|
||||
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
|
||||
errorCount += check("NFDa", nfd, parts[2], parts[0]);
|
||||
errorCount += check("NFDb", nfd, parts[2], parts[1]);
|
||||
errorCount += check("NFDc", nfd, parts[2], parts[2]);
|
||||
errorCount += check("NFDa", Main.nfd, parts[2], parts[0]);
|
||||
errorCount += check("NFDb", Main.nfd, parts[2], parts[1]);
|
||||
errorCount += check("NFDc", Main.nfd, parts[2], parts[2]);
|
||||
|
||||
// c5 == NFD(c4) == NFD(c5)
|
||||
errorCount += check("NFDd", nfd, parts[4], parts[3]);
|
||||
errorCount += check("NFDe", nfd, parts[4], parts[4]);
|
||||
errorCount += check("NFDd", Main.nfd, parts[4], parts[3]);
|
||||
errorCount += check("NFDe", Main.nfd, parts[4], parts[4]);
|
||||
|
||||
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
|
||||
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
|
||||
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
|
||||
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
|
||||
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
|
||||
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
|
||||
errorCount += check("NFKCa", Main.nfkc, parts[3], parts[0]);
|
||||
errorCount += check("NFKCb", Main.nfkc, parts[3], parts[1]);
|
||||
errorCount += check("NFKCc", Main.nfkc, parts[3], parts[2]);
|
||||
errorCount += check("NFKCd", Main.nfkc, parts[3], parts[3]);
|
||||
errorCount += check("NFKCe", Main.nfkc, parts[3], parts[4]);
|
||||
|
||||
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
|
||||
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
|
||||
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
|
||||
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
|
||||
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
|
||||
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
|
||||
errorCount += check("NFKDa", Main.nfkd, parts[4], parts[0]);
|
||||
errorCount += check("NFKDb", Main.nfkd, parts[4], parts[1]);
|
||||
errorCount += check("NFKDc", Main.nfkd, parts[4], parts[2]);
|
||||
errorCount += check("NFKDd", Main.nfkd, parts[4], parts[3]);
|
||||
errorCount += check("NFKDe", Main.nfkd, parts[4], parts[4]);
|
||||
}
|
||||
System.out.println("Total errors in file: " + errorCount
|
||||
+ ", lines: " + lineErrorCount);
|
||||
|
@ -160,21 +150,21 @@ public final class TestNormalization {
|
|||
}
|
||||
String otherList = "";
|
||||
if (!base.equals(other)) {
|
||||
otherList = "(" + ucd.getCodeAndName(other) + ")";
|
||||
otherList = "(" + Main.ucd.getCodeAndName(other) + ")";
|
||||
}
|
||||
out.println("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
+ Main.ucd.getCodeAndName(base) + " != "
|
||||
+ type
|
||||
+ otherList
|
||||
+ " == " + ucd.getCodeAndName(trans)
|
||||
+ " == " + Main.ucd.getCodeAndName(trans)
|
||||
+ temp
|
||||
);
|
||||
return 1;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new ChainException("DIFF " + type + ": "
|
||||
+ ucd.getCodeAndName(base) + " != "
|
||||
+ type + "(" + ucd.getCodeAndName(other) + ")", new Object[]{}, e);
|
||||
+ Main.ucd.getCodeAndName(base) + " != "
|
||||
+ type + "(" + Main.ucd.getCodeAndName(other) + ")", new Object[]{}, e);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -188,10 +178,10 @@ public final class TestNormalization {
|
|||
if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
|
||||
if (charsListed.get(missing)) continue;
|
||||
String x = UTF32.valueOf32(missing);
|
||||
errorCount += check("NFC", nfc, x);
|
||||
errorCount += check("NFD", nfd, x);
|
||||
errorCount += check("NFKC", nfkc, x);
|
||||
errorCount += check("NFKD", nfkd, x);
|
||||
errorCount += check("NFC", Main.nfc, x);
|
||||
errorCount += check("NFD", Main.nfd, x);
|
||||
errorCount += check("NFKC", Main.nfkc, x);
|
||||
errorCount += check("NFKD", Main.nfkd, x);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.8 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.9 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -48,6 +48,7 @@ public final class UCD implements UCD_Types {
|
|||
if (version.indexOf('.') < 0) throw new IllegalArgumentException("Version must be of form 3.1.1");
|
||||
UCD result = (UCD)versionCache.get(version);
|
||||
if (result == null) {
|
||||
//System.out.println(Utility.getStack());
|
||||
result = new UCD();
|
||||
result.fillFromFile(version);
|
||||
versionCache.put(version, result);
|
||||
|
@ -569,7 +570,8 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
|
||||
static String getCombiningClassID_fromIndex (short index, byte style) {
|
||||
if (style == NORMAL || style == NUMBER) return String.valueOf(index & 0xFF);
|
||||
index &= 0xFF;
|
||||
if (style == NORMAL || style == NUMBER) return String.valueOf(index);
|
||||
String s = "Fixed";
|
||||
switch (index) {
|
||||
case 0: s = style < LONG ? "NR" : "NotReordered"; break;
|
||||
|
@ -619,7 +621,7 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
|
||||
public static String getDecompositionTypeID_fromIndex(byte prop) {
|
||||
return getDecompositionTypeID_fromIndex(NORMAL);
|
||||
return getDecompositionTypeID_fromIndex(prop, NORMAL);
|
||||
}
|
||||
public static String getDecompositionTypeID_fromIndex(byte prop, byte style) {
|
||||
return style == SHORT ? UCD_Names.SHORT_DT[prop] : UCD_Names.DT[prop];
|
||||
|
@ -1069,7 +1071,7 @@ to guarantee identifier closure.
|
|||
uData.joiningType = JT_T;
|
||||
}
|
||||
if (!didJoiningHack && uData.joiningType != old) {
|
||||
System.out.println("HACK: Setting "
|
||||
System.out.println("HACK " + foundVersion + ": Setting "
|
||||
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
|
||||
+ ": " + Utility.hex(cp) + " " + uData.name);
|
||||
didJoiningHack = true;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.10 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.11 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -119,6 +119,8 @@ final class UCD_Names implements UCD_Types {
|
|||
"Unified_Ideograph",
|
||||
"Other_Default_Ignorable_Code_Point",
|
||||
"Deprecated",
|
||||
"Soft_Dotted",
|
||||
"Logical_Order_Exception",
|
||||
};
|
||||
|
||||
static final String[] SHORT_BP = {
|
||||
|
@ -151,6 +153,8 @@ final class UCD_Names implements UCD_Types {
|
|||
"UIdeo",
|
||||
"ODI",
|
||||
"Dep",
|
||||
"SD",
|
||||
"LOE",
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -28,6 +28,11 @@ public interface UCD_Types {
|
|||
DERIVED_NORMALIZATION = 4,
|
||||
DERIVED_ALL = 6,
|
||||
ALL = (byte)-1;
|
||||
|
||||
static final byte
|
||||
NON_ENUMERATED = -1,
|
||||
ENUMERATED = 0,
|
||||
BINARY = 1;
|
||||
|
||||
/*
|
||||
0 Code value in 4-digit hexadecimal format.
|
||||
|
@ -180,7 +185,9 @@ public interface UCD_Types {
|
|||
UnifiedIdeograph = 26,
|
||||
Reserved_Cf_Code_Point = 27,
|
||||
Deprecated = 28,
|
||||
LIMIT_BINARY_PROPERTIES = 29;
|
||||
Soft_Dotted = 29,
|
||||
Logical_Order_Exception = 30,
|
||||
LIMIT_BINARY_PROPERTIES = 31;
|
||||
|
||||
/*
|
||||
static final int
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.UnicodeSet;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public abstract class UnicodeProperty implements UCD_Types {
|
||||
|
||||
protected UCD ucd;
|
||||
protected boolean isStandard = true;
|
||||
protected byte type = NOT_DERIVED;
|
||||
private byte valueType = BINARY;
|
||||
protected boolean hasUnassigned = false;
|
||||
protected boolean valueVaries = false;
|
||||
protected boolean isBinary = true;
|
||||
protected byte defaultValueStyle = SHORT;
|
||||
protected byte defaultPropertyStyle = LONG;
|
||||
protected String valueName;
|
||||
|
@ -29,11 +33,17 @@ public abstract class UnicodeProperty implements UCD_Types {
|
|||
public void setStandard(boolean in) { isStandard = in; }
|
||||
|
||||
/**
|
||||
* What type is it?
|
||||
* What type is it? DERIVED..
|
||||
*/
|
||||
public byte getType() { return type; }
|
||||
public void setType(byte in) { type = in; }
|
||||
|
||||
/**
|
||||
* What kind of values does the property have? NON_ENUMERATED, ENUMERATED, or BINARY
|
||||
*/
|
||||
public byte getValueType() { return valueType; }
|
||||
public void setValueType(byte in) { valueType = in; }
|
||||
|
||||
/**
|
||||
* Does it apply to any unassigned characters?
|
||||
*/
|
||||
|
@ -66,7 +76,7 @@ public abstract class UnicodeProperty implements UCD_Types {
|
|||
public String getProperty(byte style) {
|
||||
if (style == NORMAL) style = defaultPropertyStyle;
|
||||
switch (style) {
|
||||
case LONG: return name.toString();
|
||||
case LONG: return Utility.getUnskeleton(name.toString(), false);
|
||||
case SHORT: return shortName.toString();
|
||||
case NUMBER: return numberName.toString();
|
||||
default: throw new IllegalArgumentException("Bad property: " + style);
|
||||
|
@ -78,7 +88,7 @@ public abstract class UnicodeProperty implements UCD_Types {
|
|||
public void setProperty(byte style, String in) {
|
||||
if (style == NORMAL) style = defaultPropertyStyle;
|
||||
switch (style) {
|
||||
case LONG: name = in; break;
|
||||
case LONG: name = Utility.getUnskeleton(in, false); break;
|
||||
case SHORT: shortName = in; break;
|
||||
case NUMBER: numberName = in; break;
|
||||
default: throw new IllegalArgumentException("Bad property: " + style);
|
||||
|
@ -98,10 +108,10 @@ public abstract class UnicodeProperty implements UCD_Types {
|
|||
public String getValue(int cp) { return getValue(cp, NORMAL); }
|
||||
|
||||
public void setValue(byte style, String in) {
|
||||
if (valueVaries) throw new IllegalArgumentException("Can't set varying value: " + style);
|
||||
if (getValueType() != BINARY) throw new IllegalArgumentException("Can't set varying value: " + style);
|
||||
if (style == NORMAL) style = defaultValueStyle;
|
||||
switch (style) {
|
||||
case LONG: valueName = in; break;
|
||||
case LONG: valueName = Utility.getUnskeleton(in, false); break;
|
||||
case SHORT: shortValueName = in; break;
|
||||
case NUMBER: numberValueName = in; break;
|
||||
default: throw new IllegalArgumentException("Bad value: " + style);
|
||||
|
@ -109,12 +119,12 @@ public abstract class UnicodeProperty implements UCD_Types {
|
|||
}
|
||||
|
||||
public String getValue(byte style) {
|
||||
if (valueVaries) throw new IllegalArgumentException(
|
||||
if (getValueType() != BINARY) throw new IllegalArgumentException(
|
||||
"Value varies in " + getName(LONG) + "; call getValue(cp)");
|
||||
try {
|
||||
if (style == NORMAL) style = defaultValueStyle;
|
||||
switch (style) {
|
||||
case LONG: return valueName.toString();
|
||||
case LONG: return Utility.getUnskeleton(valueName.toString(), false);
|
||||
case SHORT: return shortValueName.toString();
|
||||
case NUMBER: return numberValueName.toString();
|
||||
default: throw new IllegalArgumentException("Bad property: " + style);
|
||||
|
@ -124,17 +134,27 @@ public abstract class UnicodeProperty implements UCD_Types {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Does getProperty vary in contents?
|
||||
*/
|
||||
public boolean valueVaries() { return valueVaries; }
|
||||
public void setValueVaries(boolean in) { valueVaries = in; }
|
||||
|
||||
/**
|
||||
* Does it have the propertyValue?
|
||||
*/
|
||||
abstract boolean hasValue(int cp);
|
||||
|
||||
/**
|
||||
* Get the set of characters it contains
|
||||
*/
|
||||
|
||||
private UnicodeSet cache = null;
|
||||
|
||||
public UnicodeSet getSet() {
|
||||
if (cache == null) {
|
||||
cache = new UnicodeSet();
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (hasValue(cp)) cache.add(cp);
|
||||
}
|
||||
}
|
||||
return (UnicodeSet) cache.clone();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////
|
||||
|
||||
// Old Name for compatibility
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
|
||||
* $Date: 2001/12/06 00:05:53 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -16,6 +16,7 @@ import java.io.*;
|
|||
import java.util.*;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UnicodeSet;
|
||||
|
||||
final class UnifiedBinaryProperty extends UnicodeProperty {
|
||||
int majorProp;
|
||||
|
@ -30,6 +31,54 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
|
|||
return getCached(propMask, ucd);
|
||||
}
|
||||
|
||||
public static UnicodeProperty make(String propAndValue, UCD ucd) {
|
||||
return make(getPropmask(propAndValue, ucd), ucd);
|
||||
}
|
||||
|
||||
public static UnicodeSet getSet(int propMask, UCD ucd) {
|
||||
UnicodeProperty up = make(propMask, ucd);
|
||||
return up.getSet();
|
||||
}
|
||||
|
||||
public static UnicodeSet getSet(String propAndValue, UCD ucd) {
|
||||
return getSet(getPropmask(propAndValue, ucd), ucd);
|
||||
}
|
||||
|
||||
private static Map propNameCache = null;
|
||||
|
||||
public static int getPropmask(String propAndValue, UCD ucd) {
|
||||
|
||||
// cache the names
|
||||
if (propNameCache == null) {
|
||||
System.out.println("Caching Property Names");
|
||||
propNameCache = new HashMap();
|
||||
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) {
|
||||
UnicodeProperty up = UnifiedBinaryProperty.make(i, ucd);
|
||||
if (up == null) continue;
|
||||
if (!up.isStandard()) continue;
|
||||
if (up.getValueType() != BINARY) continue;
|
||||
String shortValue = Utility.getSkeleton(up.getValue(SHORT));
|
||||
String shortName = Utility.getSkeleton(up.getProperty(SHORT));
|
||||
String longValue = Utility.getSkeleton(up.getValue(LONG));
|
||||
String longName = Utility.getSkeleton(up.getProperty(LONG));
|
||||
Integer result = new Integer(i);
|
||||
propNameCache.put(longName + "=" + longValue, result);
|
||||
propNameCache.put(longName + "=" + shortValue, result);
|
||||
propNameCache.put(shortName + "=" + longValue, result);
|
||||
propNameCache.put(shortName + "=" + shortValue, result);
|
||||
}
|
||||
System.out.println("Done Caching");
|
||||
}
|
||||
|
||||
propAndValue = Utility.getSkeleton(propAndValue);
|
||||
Integer indexObj = (Integer) propNameCache.get(propAndValue);
|
||||
if (indexObj == null) {
|
||||
throw new IllegalArgumentException("No property found for " + propAndValue);
|
||||
}
|
||||
return indexObj.intValue();
|
||||
}
|
||||
|
||||
static Map cache = new HashMap();
|
||||
static UCD lastUCD = null;
|
||||
static int lastPropMask = -1;
|
||||
|
@ -76,7 +125,16 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
|
|||
shortValueName = _getValue(SHORT);
|
||||
numberValueName = _getValue(NUMBER);
|
||||
defaultValueStyle = _getDefaultStyle();
|
||||
System.out.println("Value = " + getValue(defaultValueStyle));
|
||||
|
||||
if (majorProp == (BINARY_PROPERTIES>>8)) {
|
||||
name = valueName;
|
||||
shortName = shortValueName;
|
||||
defaultPropertyStyle = defaultValueStyle;
|
||||
valueName = "YES";
|
||||
shortValueName = "Y";
|
||||
}
|
||||
|
||||
// System.out.println("Value = " + getValue(defaultValueStyle));
|
||||
// System.out.println(majorProp + ", " + propValue + ", " + name);
|
||||
// dp = new DerivedProperty(ucd);
|
||||
}
|
||||
|
@ -247,9 +305,7 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
|
|||
return UCD_Names.LONG_JOINING_TYPE[propValue];
|
||||
case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroupID_fromIndex((byte)propValue);
|
||||
case BINARY_PROPERTIES>>8: if (propValue >= LIMIT_BINARY_PROPERTIES) break;
|
||||
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex((byte)propValue);
|
||||
return UCD_Names.SHORT_BP[propValue];
|
||||
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
|
||||
case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break;
|
||||
if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue);
|
||||
return UCD_Names.ABB_SCRIPT[propValue];
|
||||
|
@ -263,7 +319,7 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
|
|||
*/
|
||||
}
|
||||
} catch (RuntimeException e) {
|
||||
throw new ChainException("Illegal property Number {0}, {1}", new Object[]{
|
||||
throw new ChainException("Illegal property Number* {0}, {1}", new Object[]{
|
||||
new Integer(majorProp), new Integer(propValue)}, e);
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}, {1}", new Object[]{
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
|
||||
* $Date: 2001/12/06 00:05:52 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2001/12/13 23:35:57 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -16,6 +16,8 @@ package com.ibm.text.utility;
|
|||
import java.util.*;
|
||||
import java.text.*;
|
||||
import java.io.*;
|
||||
import com.ibm.text.UnicodeSet;
|
||||
import com.ibm.text.UCD.*;
|
||||
|
||||
public final class Utility { // COMMON UTILITIES
|
||||
|
||||
|
@ -85,7 +87,65 @@ public final class Utility { // COMMON UTILITIES
|
|||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* These routines use the Java functions, because they only need to act on ASCII.
|
||||
* Removes space, _, and lowercases.
|
||||
*/
|
||||
|
||||
public static String getSkeleton(String source) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
boolean gotOne = false;
|
||||
// remove spaces, '_'
|
||||
// we can do this with char, since no surrogates are involved
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char ch = source.charAt(i);
|
||||
if (ch == '_' || ch == ' ') {
|
||||
gotOne = true;
|
||||
} else {
|
||||
char ch2 = Character.toLowerCase(ch);
|
||||
if (ch2 != ch) {
|
||||
gotOne = true;
|
||||
result.append(ch2);
|
||||
} else {
|
||||
result.append(ch);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!gotOne) return source; // avoid string creation
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* These routines use the Java functions, because they only need to act on ASCII
|
||||
* Changes space, - into _, inserts _ between lower and UPPER.
|
||||
*/
|
||||
|
||||
public static String getUnskeleton(String source, boolean titlecaseStart) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
int lastCat = -1;
|
||||
boolean haveFirstCased = true;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char c = source.charAt(i);
|
||||
if (c == ' ' || c == '-') c = '_';
|
||||
int cat = Character.getType(c);
|
||||
if (lastCat == Character.LOWERCASE_LETTER && cat == Character.UPPERCASE_LETTER) {
|
||||
result.append('_');
|
||||
}
|
||||
if (haveFirstCased && (cat == Character.LOWERCASE_LETTER
|
||||
|| cat == Character.TITLECASE_LETTER || cat == Character.UPPERCASE_LETTER)) {
|
||||
if (titlecaseStart) {
|
||||
c = Character.toUpperCase(c);
|
||||
}
|
||||
haveFirstCased = false;
|
||||
}
|
||||
result.append(c);
|
||||
lastCat = cat;
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
public static String findSubstring(String source, Set target, boolean invert) {
|
||||
Iterator it = target.iterator();
|
||||
while (it.hasNext()) {
|
||||
|
@ -178,6 +238,10 @@ public final class Utility { // COMMON UTILITIES
|
|||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string containing count copies of s.
|
||||
* If count <= 0, returns "".
|
||||
*/
|
||||
public static String repeat(String s, int count) {
|
||||
if (count <= 0) return "";
|
||||
if (count == 1) return s;
|
||||
|
@ -260,6 +324,10 @@ public final class Utility { // COMMON UTILITIES
|
|||
return output.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits a string containing divider into pieces, storing in output
|
||||
* and returns the number of pieces.
|
||||
*/
|
||||
public static int split(String s, char divider, String[] output) {
|
||||
int last = 0;
|
||||
int current = 0;
|
||||
|
@ -407,19 +475,22 @@ public final class Utility { // COMMON UTILITIES
|
|||
return (aEnd - aStart) - (bEnd - bStart);
|
||||
}
|
||||
|
||||
public static String join(int[] array, String sep) {
|
||||
/**
|
||||
* Joins an array together, using divider between the pieces
|
||||
*/
|
||||
public static String join(int[] array, String divider) {
|
||||
String result = "{";
|
||||
for (int i = 0; i < array.length; ++i) {
|
||||
if (i != 0) result += sep;
|
||||
if (i != 0) result += divider;
|
||||
result += array[i];
|
||||
}
|
||||
return result + "}";
|
||||
}
|
||||
|
||||
public static String join(long[] array, String sep) {
|
||||
public static String join(long[] array, String divider) {
|
||||
String result = "{";
|
||||
for (int i = 0; i < array.length; ++i) {
|
||||
if (i != 0) result += sep;
|
||||
if (i != 0) result += divider;
|
||||
result += array[i];
|
||||
}
|
||||
return result + "}";
|
||||
|
@ -506,16 +577,18 @@ public final class Utility { // COMMON UTILITIES
|
|||
}
|
||||
|
||||
public static BufferedReader openUnicodeFile(String filename, String version, boolean show) throws IOException {
|
||||
String name = getMostRecentUnicodeDataFile(filename, version, show);
|
||||
String name = getMostRecentUnicodeDataFile(filename, version, true, show);
|
||||
if (name == null) return null;
|
||||
return new BufferedReader(new FileReader(name),32*1024);
|
||||
}
|
||||
|
||||
public static String getMostRecentUnicodeDataFile(String filename, String version, boolean show) throws IOException {
|
||||
public static String getMostRecentUnicodeDataFile(String filename, String version,
|
||||
boolean acceptLatest, boolean show) throws IOException {
|
||||
// get all the files in the directory
|
||||
|
||||
int compValue = acceptLatest ? 0 : 1;
|
||||
for (int i = 0; i < searchPath.length; ++i) {
|
||||
if (version.length() != 0 && version.compareTo(searchPath[i]) < 0) continue;
|
||||
if (version.length() != 0 && version.compareTo(searchPath[i]) < compValue) continue;
|
||||
|
||||
String directoryName = DATA_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
|
||||
if (show) System.out.println("Trying: '" + directoryName + "', '" + filename + "'");
|
||||
|
@ -549,6 +622,9 @@ public final class Utility { // COMMON UTILITIES
|
|||
log.println("</head><body>");
|
||||
}
|
||||
|
||||
/**
|
||||
* Replaces all occurrences of piece with replacement, and returns new String
|
||||
*/
|
||||
public static String replace(String source, String piece, String replacement) {
|
||||
while (true) {
|
||||
int pos = source.indexOf(piece);
|
||||
|
@ -556,4 +632,30 @@ public final class Utility { // COMMON UTILITIES
|
|||
source = source.substring(0,pos) + source.substring(pos + piece.length());
|
||||
}
|
||||
}
|
||||
|
||||
public static String getStack() {
|
||||
Exception e = new Exception();
|
||||
StringWriter sw = new StringWriter();
|
||||
PrintWriter pw = new PrintWriter(sw);
|
||||
e.printStackTrace(pw);
|
||||
pw.flush();
|
||||
return "Showing Stack with fake " + sw.getBuffer().toString();
|
||||
}
|
||||
|
||||
public static void showSetNames(String prefix, UnicodeSet set, boolean all, UCD ucd) {
|
||||
int count = set.getRangeCount();
|
||||
for (int i = 0; i < count; ++i) {
|
||||
int start = set.getRangeStart(i);
|
||||
int end = set.getRangeEnd(i);
|
||||
if (all) {
|
||||
for (int cp = start; cp <= end; ++cp) {
|
||||
if (!set.contains(cp)) continue;
|
||||
System.out.println(prefix + ucd.getCodeAndName(cp));
|
||||
}
|
||||
} else {
|
||||
System.out.println(prefix + ucd.getCodeAndName(start) +
|
||||
((start != end) ? (".." + ucd.getCodeAndName(end)) : ""));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,7 +3,54 @@ WARNING!!
|
|||
These directories contain some Unicode tools used to build various files,
|
||||
and to check the consistency of the Unicode releases.
|
||||
|
||||
They are NOT production level code, and should never be used in programs.
|
||||
The API is subject to change without notice, and will not be maintained.
|
||||
The source is uncommented, and not well structured -- classic spaghetti style.
|
||||
There is no build mechanism.
|
||||
- They are NOT production level code, and should never be used in programs.
|
||||
- The API is subject to change without notice, and will not be maintained.
|
||||
- The source is uncommented, and not well structured -- classic spaghetti style.
|
||||
- There is no build mechanism.
|
||||
- I have not checked to make sure it works on Unix; probably the only change that
|
||||
needs to be made is to fix the file separator.
|
||||
|
||||
Instructions:
|
||||
|
||||
1. You must edit UCD_Types at the top, to set the directories for the build:
|
||||
|
||||
public static final String DATA_DIR = "C:\\DATA\\";
|
||||
public static final String BIN_DIR = DATA_DIR + "BIN\\";
|
||||
public static final String GEN_DIR = DATA_DIR + "GEN\\";
|
||||
|
||||
Make sure that each of these directories exists. Also make sure that the following subdirectories exist:
|
||||
<GEN_DIR>/DerivedData
|
||||
<GEN_DIR>/DerivedData/ExtractedProperties
|
||||
|
||||
|
||||
2. Download all of the UnicodeData files for each version into DATA_DIR
|
||||
The folder names must be of the form: "3.2.0-Update"
|
||||
|
||||
|
||||
3. For each version X (like 3.1.0), run
|
||||
|
||||
java version X build
|
||||
|
||||
This builds a compressed format of all the UCD data (except blocks and Unihan)
|
||||
into the BIN directory. Don't worry about the voluminous console messages, unless one says
|
||||
"FAIL".
|
||||
|
||||
|
||||
4. To build all of the files for a particular version X, run
|
||||
|
||||
java version X all
|
||||
|
||||
To build a particular file, like CaseFolding, use that file name instead of all
|
||||
|
||||
java version X CaseFolding
|
||||
|
||||
To change the D version, edit the line in GenerateData.java:
|
||||
|
||||
static final int dVersion = 2; // change to fix the generated file D version. If less than zero, no "d"
|
||||
|
||||
|
||||
5. To run basic consistency checking, run:
|
||||
|
||||
java version X verify
|
||||
|
||||
Don't worry about any console messages except those that say FAIL.
|
Loading…
Add table
Reference in a new issue