Revisions for specialcasing & misc fixes

X-SVN-Rev: 7349
This commit is contained in:
Mark Davis 2001-12-13 23:36:29 +00:00
parent ac085286cd
commit a903b84867
26 changed files with 1742 additions and 814 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/12/13 23:35:54 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -26,12 +26,9 @@ public class BuildNames implements UCD_Types {
static final boolean DEBUG = true;
static UCD ucd;
public static void main(String[] args) throws IOException {
ucd = UCD.make();
Main.setUCD();
collectWords();
}
@ -85,8 +82,8 @@ public class BuildNames implements UCD_Types {
int used = 0;
int sum = 0;
for (int i = 0; i < 0x10FFFF; ++i) {
if (ucd.hasComputableName(i)) continue;
String name = transform(ucd.getName(i));
if (Main.ucd.hasComputableName(i)) continue;
String name = transform(Main.ucd.getName(i));
sum += name.length();

View file

@ -0,0 +1,41 @@
# Case Folding Properties
#
# This file is a supplement to the UnicodeData file.
# It provides a case folding mapping generated from the Unicode Character Database.
# If all characters are mapped according to the full mapping below, then
# case differences (according to UnicodeData.txt and SpecialCasing.txt)
# are eliminated.
#
# The data supports both implementations that require simple case foldings
# (where string lengths don't change), and implementations that allow full case folding
# (where string lengths may grow). Note that where they can be supported, the
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
#
# NOTE: case folding does not preserve normalization forms!
#
# For information on case folding, see
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
#
# ================================================================================
# Format
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
# <code>; <status>; <mapping>; # <name>
#
# The status field is:
# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# I: special case for dotted uppercase I and dotless lowercase i
# - If this mapping is included, the result is case-insensitive, but dotless and dotted I's are not distinguished.
# - If this mapping is excluded, the result is not fully case-insensitive, but dotless and dotted I's are distinguished.
#
# Usage:
# A. To do a simple case folding, use the mappings with status C + S + I.
# B. To do a full case folding, use the mappings with status C + F + I.
# The mappings with status I can be omitted depending on the desired case-folding
# behavior. (The default option is to retain them.)
#
# =================================================================

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.9 $
* $Date: 2001/12/13 23:35:54 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -87,8 +87,6 @@ public final class DerivedProperty implements UCD_Types {
}
*/
private UnicodeProperty[] dprops = new UnicodeProperty[50];
private Normalizer[] nf = new Normalizer[4];
private Normalizer nfd, nfc, nfkd, nfkc;
static final String[] CaseNames = {
"Uppercase",
@ -99,7 +97,7 @@ public final class DerivedProperty implements UCD_Types {
Normalizer nfx;
ExDProp(int i) {
type = DERIVED_NORMALIZATION;
nfx = nf[i];
nfx = Main.nf[i];
name = "Expands_On_" + nfx.getName();
shortName = "XO_" + nfx.getName();
header = "# Derived Property: " + name
@ -123,7 +121,7 @@ public final class DerivedProperty implements UCD_Types {
NF_UnsafeStartProp(int i) {
isStandard = false;
type = DERIVED_NORMALIZATION;
nfx = nf[i];
nfx = Main.nf[i];
name = nfx.getName() + "_UnsafeStart";
shortName = nfx.getName() + "_SS";
header = "# Derived Property: " + name
@ -159,7 +157,7 @@ public final class DerivedProperty implements UCD_Types {
case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
}
filter = bitsets[1] != null;
nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
Main.nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
name = Names[i-NFC_Leading];
shortName = SNames[i-NFC_Leading];
@ -193,19 +191,19 @@ public final class DerivedProperty implements UCD_Types {
GenDProp (int i) {
isStandard = false;
valueVaries = true;
setValueType(NON_ENUMERATED);
type = DERIVED_NORMALIZATION;
nfx = nf[i];
nfx = Main.nf[i];
name = nfx.getName();
String compName = "the character itself";
if (i == NFKC || i == NFD) {
name += "-NFC";
nfComp = nfc;
nfComp = Main.nfc;
compName = "NFC for the character";
} else if (i == NFKD) {
name += "-NFD";
nfComp = nfd;
nfComp = Main.nfd;
compName = "NFD for the character";
}
header = "# Derived Property: " + name
@ -269,9 +267,9 @@ public final class DerivedProperty implements UCD_Types {
String MAYBE;
Normalizer nfx;
QuickDProp (int i) {
valueVaries = true;
setValueType((i == NFC || i == NFKC) ? ENUMERATED : BINARY);
type = DERIVED_NORMALIZATION;
nfx = nf[i];
nfx = Main.nf[i];
NO = nfx.getName() + "_NO";
MAYBE = nfx.getName() + "_MAYBE";
name = nfx.getName() + "_QuickCheck";
@ -291,11 +289,6 @@ public final class DerivedProperty implements UCD_Types {
};
{
nfd = nf[0] = new Normalizer(Normalizer.NFD);
nfc = nf[1] = new Normalizer(Normalizer.NFC);
nfkd = nf[2] = new Normalizer(Normalizer.NFKD);
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
dprops[i] = new ExDProp(i-ExpandsOnNFD);
}
@ -493,7 +486,7 @@ of characters, the first of which has a non-zero combining class.
dprops[FC_NFKC_Closure] = new UnicodeProperty() {
{
type = DERIVED_NORMALIZATION;
valueVaries = true;
setValueType(NON_ENUMERATED);
name = "FC_NFKC_Closure";
shortName = "FC_NFKC";
header = "# Derived Property: " + name
@ -503,8 +496,8 @@ of characters, the first of which has a non-zero combining class.
}
public String getValue(int cp, byte style) {
if (!ucdData.isRepresented(cp)) return "";
String b = nfkc.normalize(fold(cp));
String c = nfkc.normalize(fold(b));
String b = Main.nfkc.normalize(fold(cp));
String c = Main.nfkc.normalize(fold(b));
if (c.equals(b)) return "";
return "FNC; " + Utility.hex(c);
} // default
@ -516,7 +509,7 @@ of characters, the first of which has a non-zero combining class.
type = DERIVED_NORMALIZATION;
isStandard = false;
name = "FC_NFC_Closure";
valueVaries = true;
setValueType(NON_ENUMERATED);
shortName = "FC_NFC";
header = "# Derived Property: " + name
+ "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
@ -525,8 +518,8 @@ of characters, the first of which has a non-zero combining class.
}
public String getValue(int cp, byte style) {
if (!ucdData.isRepresented(cp)) return "";
String b = nfc.normalize(fold(cp));
String c = nfc.normalize(fold(b));
String b = Main.nfc.normalize(fold(cp));
String c = Main.nfc.normalize(fold(b));
if (c.equals(b)) return "";
return "FN; " + Utility.hex(c);
} // default
@ -603,8 +596,9 @@ of characters, the first of which has a non-zero combining class.
dprops[Type_i] = new UnicodeProperty() {
{
type = DERIVED_CORE;
name = "Soft_Dotted";
shortName = "SDot";
isStandard = false;
name = "DSoft_Dotted";
shortName = "DSDot";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that"
+ "\r\n# - starts with i or j"
@ -613,21 +607,24 @@ of characters, the first of which has a non-zero combining class.
;
}
boolean hasValue(int cp) {
if (cp == 'i' || cp == 'j') return true;
if (!nfkd.hasDecomposition(cp)) return false;
String decomp = nfd.normalize(cp);
if (hasSoftDot(cp)) return true;
if (!Main.nfkd.hasDecomposition(cp)) return false;
String decomp = Main.nfd.normalize(cp);
boolean ok = false;
for (int i = decomp.length()-1; i >= 0; --i) {
char ch = decomp.charAt(i);
int ch = UTF16.charAt(decomp, i);
int cc = ucdData.getCombiningClass(ch);
if (cc == 230) return false;
if (cc == 0) {
if (ch == 'i' || ch == 'j') ok = true;
else return false;
if (!hasSoftDot(ch)) return false;
ok = true;
}
}
return ok;
}
/**
 * Tells whether a code point is one of the base letters this property treats
 * as carrying a soft dot: 'i', 'j', U+0268, U+0456 or U+0458.
 *
 * @param ch code point to test
 * @return true if ch is one of the five listed code points
 */
boolean hasSoftDot(int ch) {
    switch (ch) {
        case 'i':
        case 'j':
        case 0x0268:
        case 0x0456:
        case 0x0458:
            return true;
        default:
            return false;
    }
}
};
dprops[Case_Ignorable] = new UnicodeProperty() {
@ -666,7 +663,7 @@ of characters, the first of which has a non-zero combining class.
for (int i = 0; i < dprops.length; ++i) {
UnicodeProperty up = dprops[i];
if (up == null) continue;
if (up.valueVaries()) continue;
if (up.getValueType() != BINARY) continue;
up.setValue(NUMBER, "1");
up.setValue(SHORT, "Y");
up.setValue(LONG, "YES");
@ -681,11 +678,11 @@ of characters, the first of which has a non-zero combining class.
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
// if (true) throw new IllegalArgumentException("FIX nf[2]");
// if (true) throw new IllegalArgumentException("FIX Main.nf[2]");
if (!nf[NFKD].normalizationDiffers(cp)) return Lo;
if (!Main.nf[NFKD].normalizationDiffers(cp)) return Lo;
String norm = nf[NFKD].normalize(cp);
String norm = Main.nf[NFKD].normalize(cp);
int cp2;
boolean gotUpper = false;
boolean gotLower = false;
@ -723,8 +720,8 @@ of characters, the first of which has a non-zero combining class.
}
public static void test() {
UCD ucd = UCD.make();
DerivedProperty dprop = new DerivedProperty(ucd);
Main.setUCD();
DerivedProperty dprop = new DerivedProperty(Main.ucd);
/*
for (int j = 0; j < LIMIT; ++j) {
System.out.println();
@ -735,9 +732,9 @@ of characters, the first of which has a non-zero combining class.
for (int cp = 0xA0; cp < 0xFF; ++cp) {
System.out.println();
System.out.println(ucd.getCodeAndName(cp));
System.out.println(Main.ucd.getCodeAndName(cp));
for (int j = 0; j < DERIVED_PROPERTY_LIMIT; ++j) {
String prop = make(j, ucd).getValue(cp);
String prop = make(j, Main.ucd).getValue(cp);
if (prop.length() != 0) System.out.println("\t" + prop);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedPropertyLister.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.7 $
* $Date: 2001/12/13 23:35:56 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -34,7 +34,7 @@ final class DerivedPropertyLister extends PropertyLister {
this.ucdData = ucd;
// this.dprop = new DerivedProperty(ucd);
uprop = DerivedProperty.make(propMask, ucd);
varies = uprop.valueVaries();
varies = uprop.getValueType() != BINARY;
width = super.minPropertyWidth();
switch (propMask) {
@ -56,7 +56,7 @@ final class DerivedPropertyLister extends PropertyLister {
}
public String valueName(int cp) {
if (uprop.valueVaries()) return uprop.getValue(cp, LONG);
if (uprop.getValueType() != BINARY) return uprop.getValue(cp, LONG);
return uprop.getProperty(LONG);
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateCaseFolding.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
* $Date: 2001/12/13 23:35:56 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -15,38 +15,72 @@ package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.UTF16;
import com.ibm.text.utility.*;
public class GenerateCaseFolding implements UCD_Types {
public static boolean DEBUG = false;
public static UCD ucd = UCD.make("");
public static void main(String[] args) throws java.io.IOException {
makeCaseFold();
// Output options for the case-folding generator.
// COMMENT_DIFFS: when true, drawLine prefixes the trailing name comment with a
// "[Diff ...]" note for mappings that are not the same as the full lowercase.
public static boolean COMMENT_DIFFS = false; // ON if we want a comment on mappings != lowercase
public static boolean PICK_SHORT = false; // picks short value for SIMPLE if in FULL, changes weighting
// NF_CLOSURE: when true, getClosure also adds the NFD, NFC, NFKD and NFKC forms
// of every string to its equivalence class before applying the case mappings.
public static boolean NF_CLOSURE = false;
// PICK_SHORT & NF_CLOSURE = false for old style
/*public static void main(String[] args) throws java.io.IOException {
makeCaseFold(arg[0]);
//getAge();
}
public static void makeCaseFold() throws java.io.IOException {
*/
static PrintWriter log;
public static void makeCaseFold(boolean normalized) throws java.io.IOException {
PICK_SHORT = NF_CLOSURE = normalized;
Main.setUCD();
log = Utility.openPrintWriter("CaseFoldingLog" + GenerateData.getFileSuffix(true));
System.out.println("Writing Log: " + "CaseFoldingLog" + GenerateData.getFileSuffix(true));
System.out.println("Making Full Data");
Map fullData = getCaseFolding(true);
Map fullData = getCaseFolding(true, NF_CLOSURE);
Utility.fixDot();
System.out.println("Making Simple Data");
Map simpleData = getCaseFolding(false);
Map simpleData = getCaseFolding(false, NF_CLOSURE);
// write the data
Utility.fixDot();
System.out.println("Writing");
String filename = "CaseFolding";
if (normalized) filename += "-Normalized";
String directory = "DerivedData/";
PrintWriter out = Utility.openPrintWriter(directory + filename + GenerateData.getFileSuffix(true));
GenerateData.generateBat(directory, filename, GenerateData.getFileSuffix(true));
out.println("# CaseFolding" + GenerateData.getFileSuffix(false));
out.println("#");
out.println("# Generated: " + new Date() + ", MD");
Utility.appendFile("CaseFoldingHeader.txt", false, out);
/*
PrintWriter out = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream("CaseFoldingSample.txt"),
new FileOutputStream(directory + fileRoot + GenerateData.getFileSuffix()),
"UTF8"),
4*1024));
*/
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
Utility.dot(ch);
for (int ch = 0; ch < 0x10FFFF; ++ch) {
if (!charsUsed.get(ch)) continue;
String rFull = (String)fullData.get(UTF32.valueOf32(ch));
String rSimple = (String)simpleData.get(UTF32.valueOf32(ch));
if (rFull == null && rSimple == null) continue;
if (rFull != null && rFull.equals(rSimple)) {
if (rFull != null && rFull.equals(rSimple)
|| (PICK_SHORT && UTF16.countCodePoint(rFull) == 1)) {
String type = "C";
if (ch == 0x130 || ch == 0x131) type = "I";
drawLine(out, ch, type, rFull);
@ -60,35 +94,63 @@ public class GenerateCaseFolding implements UCD_Types {
}
}
out.close();
log.close();
}
static void drawLine(PrintWriter out, int ch, String type, String result) {
String comment = "";
if (COMMENT_DIFFS) {
String lower = Main.ucd.getCase(UTF16.valueOf(ch), FULL, LOWER);
if (!lower.equals(result)) {
String upper = Main.ucd.getCase(UTF16.valueOf(ch), FULL, UPPER);
String lower2 = Main.ucd.getCase(UTF16.valueOf(ch), FULL, LOWER);
if (lower.equals(lower2)) {
comment = "[Diff " + Utility.hex(lower, " ") + "] ";
} else {
Utility.fixDot();
System.out.println("PROBLEM WITH: " + Main.ucd.getCodeAndName(ch));
comment = "[DIFF " + Utility.hex(lower, " ") + ", " + Utility.hex(lower2, " ") + "] ";
}
}
}
out.println(Utility.hex(ch)
+ "; " + type +
"; " + Utility.hex(result, " ") +
"; # " + ucd.getName(ch));
+ "; " + type
+ "; " + Utility.hex(result, " ")
+ "; # " + comment + Main.ucd.getName(ch));
}
static int probeCh = 0x01f0;
static String shower = UTF16.valueOf(probeCh);
static Map getCaseFolding(boolean full) throws java.io.IOException {
static Map getCaseFolding(boolean full, boolean nfClose) throws java.io.IOException {
Map data = new TreeMap();
Map repChar = new TreeMap();
//String option = "";
// get the equivalence classes
for (int ch = 0; ch < 0x10FFFF; ++ch) {
if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
if (!ucd.isRepresented(ch)) continue;
getClosure(ch, data, full);
for (int ch = 0; ch <= 0x10FFFF; ++ch) {
Utility.dot(ch);
//if ((ch & 0x3FF) == 0) System.out.println(Utility.hex(ch));
if (!Main.ucd.isRepresented(ch)) continue;
getClosure(ch, data, full, nfClose);
}
// get the representative characters
Iterator it = data.keySet().iterator();
while (it.hasNext()) {
String s = (String) it.next();
Set set = (Set) data.get(s);
show = set.contains(shower);
if (show) {
Utility.fixDot();
System.out.println(toString(set));
}
// Pick the best available representative
String rep = null;
int repGood = 0;
String dup = null;
@ -104,30 +166,63 @@ public class GenerateCaseFolding implements UCD_Types {
dup = s2;
}
}
if (rep == null) System.err.println("No representative for: " + toString(set));
else if (repGood < 128) {
System.err.println("Non-optimal!!: "
+ ucd.getName(rep) + ", " + toString(set,true));
if (rep == null) {
Utility.fixDot();
System.err.println("No representative for: " + toString(set));
} else if ((repGood & (NFC_FORMAT | ISLOWER)) != (NFC_FORMAT | ISLOWER)) {
String message = "";
if ((repGood & NFC_FORMAT) == 0) {
message += " [NOT NFC FORMAT]";
}
if ((repGood & ISLOWER) == 0) {
message += " [NOT LOWERCASE]";
}
Utility.fixDot();
log.println("Non-Optimal Representative " + message);
log.println(" Rep:\t" + Main.ucd.getCodeAndName(rep));
log.println(" Set:\t" + toString(set,true, true));
}
// Add it for all the elements of the set
it2 = set.iterator();
while (it2.hasNext()) {
String s2 = (String)it2.next();
if (s2.length() == 1 && !s2.equals(rep)) repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
if (UTF16.countCodePoint(s2) == 1 && !s2.equals(rep)) {
repChar.put(UTF32.getCodePointSubstring(s2,0), rep);
charsUsed.set(UTF16.charAt(s2, 0));
}
}
}
return repChar;
}
static BitSet charsUsed = new BitSet();
static boolean show = false;
static final int NFC_FORMAT = 64;
static final int ISLOWER = 128;
static int goodness(String s, boolean full) {
if (s == null) return 0;
int result = s.length();
if (s.equals(lower(upper(s, full), full))) result |= 128;
if (s.equals(NFC.normalize(s))) result |= 64;
int result = 32-s.length();
if (!PICK_SHORT) {
result = s.length();
}
if (!full) result <<= 8;
String low = lower(upper(s, full), full);
if (s.equals(low)) result |= ISLOWER;
else if (PICK_SHORT && Main.nfd.normalize(s).equals(Main.nfd.normalize(low))) result |= ISLOWER;
if (s.equals(Main.nfc.normalize(s))) result |= NFC_FORMAT;
if (show) {
Utility.fixDot();
System.out.println(Utility.hex(result) + ", " + Main.ucd.getCodeAndName(s));
}
return result;
}
static Normalizer NFC = new Normalizer(Normalizer.NFC);
/*
static HashSet temp = new HashSet();
static void normalize(HashSet set) {
@ -151,33 +246,33 @@ public class GenerateCaseFolding implements UCD_Types {
/*
String
String lower1 = ucd.getLowercase(ch);
String lower2 = ucd.toLowercase(ch,option);
String lower1 = Main.ucd.getLowercase(ch);
String lower2 = Main.ucd.toLowercase(ch,option);
char ch2 = ucd.getLowercase(ucd.getUppercase(ch).charAt(0)).charAt(0);
//String lower1 = String.valueOf(ucd.getLowercase(ch));
//String lower = ucd.toLowercase(ch2,option);
String upper = ucd.toUppercase(ch2,option);
String lowerUpper = ucd.toLowercase(upper,option);
//String title = ucd.toTitlecase(ch2,option);
//String lowerTitle = ucd.toLowercase(upper,option);
char ch2 = Main.ucd.getLowercase(Main.ucd.getUppercase(ch).charAt(0)).charAt(0);
//String lower1 = String.valueOf(Main.ucd.getLowercase(ch));
//String lower = Main.ucd.toLowercase(ch2,option);
String upper = Main.ucd.toUppercase(ch2,option);
String lowerUpper = Main.ucd.toLowercase(upper,option);
//String title = Main.ucd.toTitlecase(ch2,option);
//String lowerTitle = Main.ucd.toLowercase(upper,option);
if (ch != ch2 || lowerUpper.length() != 1 || ch != lowerUpper.charAt(0)) { //
output.println(Utility.hex(ch)
+ "; " + (lowerUpper.equals(lower1) ? "L" : lowerUpper.equals(lower2) ? "S" : "E")
+ "; " + Utility.hex(lowerUpper," ")
+ ";\t#" + ucd.getName(ch)
+ ";\t#" + Main.ucd.getName(ch)
);
//if (!lowerUpper.equals(lower)) {
// output.println("Warning1: " + Utility.hex(lower) + " " + ucd.getName(lower));
// output.println("Warning1: " + Utility.hex(lower) + " " + Main.ucd.getName(lower));
//}
//if (!lowerUpper.equals(lowerTitle)) {
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + ucd.getName(lowerTitle));
// output.println("Warning2: " + Utility.hex(lowerTitle) + " " + Main.ucd.getName(lowerTitle));
//}
}
*/
static void getClosure(int ch, Map data, boolean full) {
static void getClosure(int ch, Map data, boolean full, boolean nfClose) {
String charStr = UTF32.valueOf32(ch);
String lowerStr = lower(charStr, full);
String titleStr = title(charStr, full);
@ -202,7 +297,13 @@ public class GenerateCaseFolding implements UCD_Types {
while (it.hasNext()) {
String s = (String) it.next();
// do funny stuff since we can't modify set while iterating
//if (add(set, NFC.normalize(s), data)) continue main;
// We don't do this because if the source is not normalized, we don't want to normalize
if (nfClose) {
if (add(set, Main.nfd.normalize(s), data)) continue main;
if (add(set, Main.nfc.normalize(s), data)) continue main;
if (add(set, Main.nfkd.normalize(s), data)) continue main;
if (add(set, Main.nfkc.normalize(s), data)) continue main;
}
if (add(set, lower(s, full), data)) continue main;
if (add(set, title(s, full), data)) continue main;
if (add(set, upper(s, full), data)) continue main;
@ -216,31 +317,34 @@ public class GenerateCaseFolding implements UCD_Types {
return result.replace('\u03C2', '\u03C3'); // HACK for lower
}
// These functions are no longer necessary, since UCD is parameterized,
// These functions are no longer necessary, since Main.ucd is parameterized,
// but it's not worth changing
static String lower2(String s, boolean full) {
if (!full) {
/*if (!full) {
if (s.length() != 1) return s;
return ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
return Main.ucd.getCase(UTF32.char32At(s,0), SIMPLE, LOWER);
}
return ucd.getCase(s, FULL, LOWER);
*/
return Main.ucd.getCase(s, full ? FULL : SIMPLE, LOWER);
}
static String upper(String s, boolean full) {
if (!full) {
/* if (!full) {
if (s.length() != 1) return s;
return ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
return Main.ucd.getCase(UTF32.char32At(s,0), FULL, UPPER);
}
return ucd.getCase(s, SIMPLE, UPPER);
*/
return Main.ucd.getCase(s, full ? FULL : SIMPLE, UPPER);
}
static String title(String s, boolean full) {
if (!full) {
/*if (!full) {
if (s.length() != 1) return s;
return ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
return Main.ucd.getCase(UTF32.char32At(s,0), FULL, TITLE);
}
return ucd.getCase(s, SIMPLE, TITLE);
*/
return Main.ucd.getCase(s, full ? FULL : SIMPLE, TITLE);
}
static boolean add(Set set, String s, Map data) {
@ -261,28 +365,173 @@ public class GenerateCaseFolding implements UCD_Types {
}
static String toString(Set set) {
String result = "{";
Iterator it2 = set.iterator();
boolean first = true;
while (it2.hasNext()) {
String s2 = (String) it2.next();
if (!first) result += ", ";
first = false;
result += Utility.hex(s2, " ");
}
return result + "}";
return toString(set, false, false);
}
static String toString(Set set, boolean t) {
static String toString(Set set, boolean name, boolean crtab) {
String result = "{";
Iterator it2 = set.iterator();
boolean first = true;
while (it2.hasNext()) {
String s2 = (String) it2.next();
if (!first) result += ", ";
if (!first) {
if (crtab) {
result += ";\r\n\t";
} else {
result += "; ";
}
}
first = false;
result += ucd.getName(s2);
if (name) {
result += Main.ucd.getCodeAndName(s2);
} else {
result += Utility.hex(s2, " ");
}
}
return result + "}";
}
/**
 * Like Main.nfkd.normalizationDiffers, except that U+00DF (es-zed) is always
 * reported as differing; it is answered without consulting the normalizer.
 *
 * @param ch code point to test
 * @return true for U+00DF, otherwise whatever the NFKD normalizer reports
 */
static boolean specialNormalizationDiffers(int ch) {
    return ch == 0x00DF || Main.nfkd.normalizationDiffers(ch);
}
/**
 * NFKD-normalizes a string, with one override: the single-character string
 * U+00DF maps to "ss" instead of going through the normalizer.
 *
 * @param s string to normalize
 * @return "ss" when s is exactly "\u00DF", otherwise Main.nfkd.normalize(s)
 */
static String specialNormalization(String s) {
    return s.equals("\u00DF") ? "ss" : Main.nfkd.normalize(s);
}
/**
 * Decides whether a code point is kept out of the generated special-casing
 * list (generateSpecialCasing writes such entries to its log instead).
 *
 * @param ch code point to test
 * @return true if ch is on the explicit skip list, falls in one of the two
 *         skipped ranges, or has a COMPAT_SQUARE decomposition type
 */
static boolean isExcluded(int ch) {
    // Individually skipped code points.
    switch (ch) {
        case 0x130:  // LATIN CAPITAL LETTER I WITH DOT ABOVE
        case 0x0132: // IJ
        case 0x0133: // ij
        case 0x037A: // GREEK YPOGEGRAMMENI
            return true;
        default:
            break;
    }

    // Skipped ranges: PARENTHESIZED LATIN SMALL LETTER A.., and Rupee.. (0x20A8..0x217B).
    boolean parenthesizedLetter = ch >= 0x249C && ch <= 0x24B5;
    boolean rupeeRange = ch >= 0x20A8 && ch <= 0x217B;
    if (parenthesizedLetter || rupeeRange) {
        return true;
    }

    // Only COMPAT_SQUARE decompositions are skipped; COMPAT_UNSPECIFIED is not.
    return Main.ucd.getDecompositionType(ch) == COMPAT_SQUARE;
}
/**
 * Generates the SpecialCasing data file in DerivedData/.
 *
 * For every represented code point whose special NFKD normalization differs
 * (see specialNormalizationDiffers), the simple case mappings of the character
 * are compared with the simple case mappings of its decomposition. Characters
 * for which the decomposition yields different, multi-character results get a
 * line "code; lower; title; upper; # name". Lines for characters rejected by
 * isExcluded are written, commented out, to the SpecialCasingExceptions log
 * instead of the data file.
 *
 * The data lines are grouped into numbered sections (1..7) and each section is
 * introduced by a header comment or an appended text file.
 *
 * @throws IOException if one of the output or header files cannot be handled
 */
static void generateSpecialCasing() throws IOException {
    Main.setUCD();

    // Key = (section order << 24) | code point. Code points are at most 0x10FFFF,
    // so they fit below bit 24 and the TreeMap iterates by section, then by code point.
    Map sorted = new TreeMap();

    PrintWriter log = Utility.openPrintWriter("SpecialCasingExceptions" + GenerateData.getFileSuffix(true));
    try {
        for (int ch = 0; ch <= 0x10FFFF; ++ch) {
            Utility.dot(ch);
            if (!Main.ucd.isRepresented(ch)) continue;
            if (!specialNormalizationDiffers(ch)) continue;

            // Simple mappings of the character itself.
            String lower = Main.nfc.normalize(Main.ucd.getCase(ch, SIMPLE, LOWER));
            String upper = Main.nfc.normalize(Main.ucd.getCase(ch, SIMPLE, UPPER));
            String title = Main.nfc.normalize(Main.ucd.getCase(ch, SIMPLE, TITLE));

            String chstr = UTF16.valueOf(ch);

            // Simple mappings applied to the decomposed character.
            String decomp = specialNormalization(chstr);
            String flower = Main.nfc.normalize(Main.ucd.getCase(decomp, SIMPLE, LOWER));
            String fupper = Main.nfc.normalize(Main.ucd.getCase(decomp, SIMPLE, UPPER));
            String ftitle = Main.nfc.normalize(Main.ucd.getCase(decomp, SIMPLE, TITLE));

            // Decomposition (recomposed to NFC) of the character and of its simple mappings.
            String base = Main.nfc.normalize(decomp);
            String blower = Main.nfc.normalize(specialNormalization(lower));
            String bupper = Main.nfc.normalize(specialNormalization(upper));
            String btitle = Main.nfc.normalize(specialNormalization(title));

            // Debugging dump for one probe character.
            if (ch == 0x249c) {
                System.out.println("Code: " + Main.ucd.getCodeAndName(ch));
                System.out.println("Decomp: " + Main.ucd.getCodeAndName(decomp));
                System.out.println("Base: " + Main.ucd.getCodeAndName(base));
                System.out.println("SLower: " + Main.ucd.getCodeAndName(lower));
                System.out.println("FLower: " + Main.ucd.getCodeAndName(flower));
                System.out.println("BLower: " + Main.ucd.getCodeAndName(blower));
                System.out.println("STitle: " + Main.ucd.getCodeAndName(title));
                System.out.println("FTitle: " + Main.ucd.getCodeAndName(ftitle));
                System.out.println("BTitle: " + Main.ucd.getCodeAndName(btitle));
                System.out.println("SUpper: " + Main.ucd.getCodeAndName(upper));
                System.out.println("FUpper: " + Main.ucd.getCodeAndName(fupper));
                System.out.println("BUpper: " + Main.ucd.getCodeAndName(bupper));
            }

            // presumably if there is a single code point, it would already be in the simple mappings
            // NOTE(review): the third test uses title (simple titlecase of ch), not ftitle,
            // unlike the other two operands -- confirm that this is intended.
            if (UTF16.countCodePoint(flower) == 1 && UTF16.countCodePoint(fupper) == 1
                    && UTF16.countCodePoint(title) == 1) continue;

            // if there is no change from the base, skip
            if (flower.equals(base) && fupper.equals(base) && ftitle.equals(base)) continue;

            // fix special cases: where casing the decomposition gives nothing beyond
            // decomposing the simple mapping, fall back to the simple mapping
            // if (flower.equals(blower) && fupper.equals(bupper) && ftitle.equals(btitle)) continue;
            if (flower.equals(blower)) flower = lower;
            if (fupper.equals(bupper)) fupper = upper;
            if (ftitle.equals(btitle)) ftitle = title;

            // if there are no changes from the original, or the expanded original, skip
            if (flower.equals(lower) && fupper.equals(upper) && ftitle.equals(title)) continue;

            String name = Main.ucd.getName(ch);

            // A mapping equal to the bare decomposition is written as the character itself.
            String mapping = Utility.hex(ch)
                + "; " + Utility.hex(flower.equals(base) ? chstr : flower)
                + "; " + Utility.hex(ftitle.equals(base) ? chstr : ftitle)
                + "; " + Utility.hex(fupper.equals(base) ? chstr : fupper)
                + "; # " + name;

            // Section number; must stay in step with the header switch below.
            int order = name.equals("LATIN SMALL LETTER SHARP S") ? 1
                : name.indexOf("ARMENIAN SMALL LIGATURE") >= 0 ? 3
                : name.indexOf("LIGATURE") >= 0 ? 2
                : name.indexOf("GEGRAMMENI") < 0 ? 4
                : UTF16.countCodePoint(ftitle) == 1 ? 5
                : UTF16.countCodePoint(fupper) == 2 ? 6
                : 7;

            // special exclusions
            if (isExcluded(ch)) {
                log.println("# " + mapping);
            } else {
                sorted.put(new Integer((order << 24) | ch), mapping);
            }
        }
    } finally {
        // Close the exceptions log even if the scan fails part-way.
        log.close();
    }

    System.out.println("Writing");
    PrintWriter out = Utility.openPrintWriter("DerivedData/SpecialCasing" + GenerateData.getFileSuffix(true));
    try {
        GenerateData.generateBat("DerivedData/", "SpecialCasing", GenerateData.getFileSuffix(true));
        Utility.appendFile("SpecialCasingHeader.txt", true, out);

        Iterator it = sorted.keySet().iterator();
        int lastOrder = -1;
        while (it.hasNext()) {
            Integer key = (Integer) it.next();
            String line = (String) sorted.get(key);
            int order = key.intValue() >> 24;
            if (order != lastOrder) {
                // First line of a new section: blank line, then the section header (if any).
                lastOrder = order;
                out.println();
                boolean skipLine = false;
                switch(order) {
                case 1:
                    out.println("# The German es-zed is special--the normal mapping is to SS.");
                    out.println("# Note: the titlecase should never occur in practice. It is equal to titlecase(uppercase(<es-zed>))");
                    break;
                case 2: out.println("# Ligatures"); break;
                case 3: skipLine = true; break;
                case 4: out.println("# No corresponding uppercase precomposed character"); break;
                case 5: Utility.appendFile("SpecialCasingIota.txt", true, out); break;
                case 6: out.println("# Some characters with YPOGEGRAMMENI also have no corresponding titlecases"); break;
                case 7: skipLine = true; break;
                }
                if (!skipLine) out.println();
            }
            out.println(line);
        }
        Utility.appendFile("SpecialCasingFooter.txt", true, out);
    } finally {
        // Close the data file even if writing a header or footer fails.
        out.close();
    }
}
}

File diff suppressed because it is too large Load diff

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MLStreamWriter.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -245,7 +245,7 @@ public class MLStreamWriter extends Writer {
boolean isHTML;
ArrayList stack = new ArrayList();
boolean inElement = false;
Normalizer formC = new Normalizer(Normalizer.NFC);
Normalizer formC = new Normalizer(Normalizer.NFC, "");
int len;
int maxLineLength = 60;
// later, add better line end management, indenting

View file

@ -5,17 +5,59 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.7 $
* $Date: 2001/12/13 23:35:56 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import java.util.Date;
public final class Main {
static String ucdVersion = "";
public final class Main implements UCD_Types {
static String ucdVersion = UCD.latestVersion;
static UCD ucd;
static Normalizer nfc;
static Normalizer nfd;
static Normalizer nfkc;
static Normalizer nfkd;
static Normalizer[] nf = new Normalizer[4];
/**
 * Loads the shared UCD instance and the four normalizers for the version in
 * ucdVersion, storing them in the static fields of this class (each normalizer
 * both in the nf table and in its named field), then prints what was loaded.
 */
static void setUCD() {
    ucd = UCD.make(ucdVersion);

    // Fill the indexed table, mirroring each entry into its named field.
    nf[NFD] = new Normalizer(Normalizer.NFD, ucdVersion);
    nfd = nf[NFD];
    nf[NFC] = new Normalizer(Normalizer.NFC, ucdVersion);
    nfc = nf[NFC];
    nf[NFKD] = new Normalizer(Normalizer.NFKD, ucdVersion);
    nfkd = nf[NFKD];
    nf[NFKC] = new Normalizer(Normalizer.NFKC, ucdVersion);
    nfkc = nf[NFKC];

    StringBuffer report = new StringBuffer("Loaded UCD");
    report.append(ucd.getVersion());
    report.append(" ");
    report.append(new Date(ucd.getDate()));
    System.out.println(report.toString());
}
// Base names of Unicode data files, given without directory, version suffix or
// extension. NOTE(review): presumably the set of files handled when every file
// is requested -- the code that reads this array is not visible here; confirm.
// "OtherDerivedProperties" is currently disabled.
static final String[] ALL_FILES = {
"CaseFolding",
"CompositionExclusions",
"DerivedBidiClass",
"DerivedBinaryProperties",
"DerivedCombiningClass",
"DerivedCoreProperties",
"DerivedDecompositionType",
"DerivedEastAsianWidth",
"DerivedGeneralCategory",
"DerivedJoiningGroup",
"DerivedJoiningType",
"DerivedLineBreak",
"DerivedNormalizationProperties",
"DerivedNumericType",
"DerivedNumericValues",
"NormalizationTest",
"PropertyAliases",
"PropList",
"Scripts",
"SpecialCasing",
"DerivedAge",
//"OtherDerivedProperties",
};
public static void main (String[] args) throws Exception {
@ -26,19 +68,19 @@ public final class Main {
Utility.fixDot();
System.out.println("Argument: " + args[i]);
if (arg.equalsIgnoreCase("all")) {
//checkCase();
if (arg.equalsIgnoreCase("verify")) {
VerifyUCD.verify();
VerifyUCD.checkCanonicalProperties();
VerifyUCD.CheckCaseFold();
VerifyUCD.checkAgainstUInfo();
} else if (arg.equalsIgnoreCase("build")) {
ConvertUCD.main(new String[]{ucdVersion});
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
} else if (arg.equalsIgnoreCase("build")) ConvertUCD.main(new String[]{ucdVersion});
else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
else if (arg.equalsIgnoreCase("compareBlueberry")) VerifyUCD.compareBlueberry();
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
@ -52,19 +94,180 @@ public final class Main {
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
else if (arg.equalsIgnoreCase("Generate")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
/*else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
*/
else {
System.out.println("Unknown option -- must be one of the following (case-insensitive)");
System.out.println("generateXML, checkCase, checkCanonicalProperties, CheckCaseFold,");
System.out.println("VerifyIDN, NFTest, test1, ");
// System.out.println(checkAgainstUInfo,");
System.out.println("checkScripts, IdentifierTest, writeNormalizerTestSuite");
}
else extras(new String[] {arg});
}
}
/**
 * Runs the generator or listing command named by each argument, in order.
 * Names are matched case-insensitively.
 * <p>
 * Special arguments:
 * <ul>
 * <li>An argument whose first character is '#' ends processing; that argument
 *     and everything after it is ignored.</li>
 * <li>"All" appends every name in ALL_FILES to the end of the argument list,
 *     so those commands run after any arguments that were already present.</li>
 * </ul>
 * Note that an empty-string argument makes charAt(0) throw
 * StringIndexOutOfBoundsException.
 *
 * @param args option names to process
 * @throws IllegalArgumentException if an argument matches no known option
 * @throws Exception whatever the invoked generator throws
 */
public static void extras (String[] args) throws Exception {
//ubp = new UnifiedBinaryProperty(ucd);
// Set once "All" has been expanded; from then on each argument is echoed.
boolean expanding = false;
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.charAt(0) == '#') return; // skip rest of line
// Only referenced by options that are currently commented out below.
long mask = 0;
Utility.fixDot();
if (expanding) System.out.println("Argument: " + args[i]);
if (arg.equalsIgnoreCase("All")) {
// Append all args at end
// The loop condition re-reads args.length, so the appended names are
// picked up by later iterations of this same loop.
String[] temp = new String[args.length + ALL_FILES.length];
System.arraycopy(args, 0, temp, 0, args.length);
System.arraycopy(ALL_FILES, 0, temp, args.length, ALL_FILES.length);
args = temp;
expanding = true;
// EXTRACTED PROPERTIES
} else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
GenerateData.generateVerticalSlice(BIDI_CLASS, BIDI_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedBidiClass");
} else if (arg.equalsIgnoreCase("DerivedBinaryProperties")) {
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES+1, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedBinaryProperties" );
} else if (arg.equalsIgnoreCase("DerivedCombiningClass")) {
GenerateData.generateVerticalSlice(COMBINING_CLASS, COMBINING_CLASS+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedCombiningClass" );
} else if (arg.equalsIgnoreCase("DerivedDecompositionType")) {
GenerateData.generateVerticalSlice(DECOMPOSITION_TYPE, DECOMPOSITION_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedDecompositionType" );
} else if (arg.equalsIgnoreCase("DerivedEastAsianWidth")) {
GenerateData.generateVerticalSlice(EAST_ASIAN_WIDTH, EAST_ASIAN_WIDTH+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedEastAsianWidth" );
} else if (arg.equalsIgnoreCase("DerivedGeneralCategory")) {
GenerateData.generateVerticalSlice(CATEGORY, CATEGORY+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedGeneralCategory" );
} else if (arg.equalsIgnoreCase("DerivedJoiningGroup")) {
GenerateData.generateVerticalSlice(JOINING_GROUP, JOINING_GROUP+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedJoiningGroup" );
} else if (arg.equalsIgnoreCase("DerivedJoiningType")) {
GenerateData.generateVerticalSlice(JOINING_TYPE, JOINING_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedJoiningType" );
} else if (arg.equalsIgnoreCase("DerivedLineBreak")) {
GenerateData.generateVerticalSlice(LINE_BREAK, LINE_BREAK+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedLineBreak" );
} else if (arg.equalsIgnoreCase("DerivedNumericType")) {
GenerateData.generateVerticalSlice(NUMERIC_TYPE, NUMERIC_TYPE+NEXT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedNumericType" );
} else if (arg.equalsIgnoreCase("DerivedNumericValues")) {
// NOTE(review): start and end are both LIMIT_ENUM here, unlike the other
// slices above -- confirm this is how generateVerticalSlice selects numeric values.
GenerateData.generateVerticalSlice(LIMIT_ENUM, LIMIT_ENUM, GenerateData.HEADER_DERIVED,
"DerivedData/DerivedExtractedProperties/", "DerivedNumericValues" );
// OTHER STANDARD PROPERTIES
} else if (arg.equalsIgnoreCase("CaseFolding")) {
// makeCaseFold is run twice, once per value of its boolean flag.
GenerateCaseFolding.makeCaseFold(true);
GenerateCaseFolding.makeCaseFold(false);
} else if (arg.equalsIgnoreCase("SpecialCasing")) {
GenerateCaseFolding.generateSpecialCasing();
} else if (arg.equalsIgnoreCase("CompositionExclusions")) {
GenerateData.generateCompExclusions();
} else if (arg.equalsIgnoreCase("DerivedAge")) {
GenerateData.generateAge("DerivedData/", "DerivedAge");
} else if (arg.equalsIgnoreCase("DerivedCoreProperties")) {
GenerateData.generateDerived(DERIVED_CORE, true, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedCoreProperties");
} else if (arg.equalsIgnoreCase("DerivedNormalizationProperties")) {
GenerateData.generateDerived(DERIVED_NORMALIZATION, true, GenerateData.HEADER_DERIVED, "DerivedData/",
"DerivedNormalizationProperties" );
} else if (arg.equalsIgnoreCase("NormalizationTest")) {
GenerateData.writeNormalizerTestSuite("DerivedData/", "NormalizationTest");
} else if (arg.equalsIgnoreCase("PropertyAliases")) {
GenerateData.generatePropertyAliases();
} else if (arg.equalsIgnoreCase("PropList")) {
GenerateData.generateVerticalSlice(BINARY_PROPERTIES + White_space, BINARY_PROPERTIES + NEXT_ENUM,
GenerateData.HEADER_EXTEND, "DerivedData/", "PropList");
} else if (arg.equalsIgnoreCase("Scripts")) {
GenerateData.generateVerticalSlice(SCRIPT+1, SCRIPT + NEXT_ENUM,
GenerateData.HEADER_SCRIPTS, "DerivedData/", "Scripts");
// OTHER TESTING
// None of the options below are listed in ALL_FILES, so "All" does not run them.
} else if (arg.equalsIgnoreCase("OtherDerivedProperties")) {
//mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
GenerateData.generateDerived(ALL, false, GenerateData.HEADER_DERIVED, "OtherData/", "OtherDerivedProperties");
} else if (arg.equalsIgnoreCase("AllBinary")) {
// NOTE(review): writes under "OtherDerived/" while OtherDerivedProperties
// uses "OtherData/" -- confirm the differing directory is intended.
GenerateData.generateVerticalSlice(BINARY_PROPERTIES, BINARY_PROPERTIES + NEXT_ENUM,
GenerateData.HEADER_EXTEND, "OtherDerived/", "AllBinary");
} else if (arg.equalsIgnoreCase("DerivedGeneralCategoryTEST")) {
GenerateData.generateVerticalSlice(CATEGORY+29, CATEGORY+32, GenerateData.HEADER_DERIVED,
"DerivedData/", "DerivedGeneralCategory" );
} else if (arg.equalsIgnoreCase("differences")) {
GenerateData.listDifferences();
} else if (arg.equalsIgnoreCase("partition")) {
GenerateData.partitionProperties();
} else if (arg.equalsIgnoreCase("listAccents")) {
GenerateData.listCombiningAccents();
} else if (arg.equalsIgnoreCase("listGreekVowels")) {
GenerateData.listGreekVowels();
} else if (arg.equalsIgnoreCase("listKatakana")) {
GenerateData.listKatakana();
/*
} else if (arg.equalsIgnoreCase("DerivedFullNormalization")) {
mask = Utility.setBits(0, DerivedProperty.GenNFD, DerivedProperty.GenNFKC);
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedFullNormalization" );
} else if (arg.equalsIgnoreCase("caseignorable")) {
mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i);
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "CaseIgnorable" );
} else if (arg.equalsIgnoreCase("nfunsafestart")) {
mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart);
GenerateData.generateDerived(mask, GenerateData.HEADER_DERIVED, "OtherData/", "NFUnsafeStart");
*/
} else {
// Unlike the '#' case above, an unrecognized name aborts the whole run.
throw new IllegalArgumentException(" ! Unknown option -- see Main.java for options");
}
//checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
//checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
//GenerateData.generateDerived(Utility.setBits(0, DerivedProperty.PropMath, DerivedProperty.Mod_ID_Continue_NO_Cf),
// GenerateData.HEADER_DERIVED, "DerivedData/", "DerivedPropData2" );
//GenerateData.generateVerticalSlice(SCRIPT, SCRIPT+1, "ScriptCommon" );
//listStrings("LowerCase" , 0,0);
//GenerateData.generateVerticalSlice(0, LIMIT_ENUM, SKIP_SPECIAL, PROPLIST1, "DerivedData/", "DerivedPropData1" );
// AGE stuff
//UCD ucd = UCD.make();
//System.out.println(ucd.getAgeID(0x61));
//System.out.println(ucd.getAgeID(0x2FA1D));
//
}
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.6 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -53,6 +53,7 @@ final class MyPropertyLister extends PropertyLister {
}
/**
 * Label to print for the value at a code point.
 * A BINARY property has no per-character value, so the property's own name is
 * used; any other value type reports the property's value for {@code cp}.
 */
public String valueName(int cp) {
    return (up.getValueType() == BINARY)
        ? up.getName()
        : up.getValue(cp);
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2001/12/03 19:29:35 $
* $Revision: 1.5 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -49,9 +49,9 @@ public final class Normalizer implements UCD_Types {
/**
* Create a normalizer for a given form.
*/
public Normalizer(byte form) {
this(form,"");
}
// public Normalizer(byte form) {
// this(form,"");
//}
/**
* Return string name

View file

@ -26,13 +26,12 @@
#
# NOTE: The property value names are NOT unique across properties, especially
# with loose matches. For example,
#
# AL means Arabic Letter for the Bidi_Class property, and
# AL means Alpha_Left for the Combining_Class property, and
# AL means Alphabetic for the Line_Break property.
#
# In addition, some property names may be the same as some property value names:
# cc means Combining_Class property, and
# cc means the General_Category property value Control (cc)
# In addition, some property names may be the same as some property value names.
#
# The combination of property value and property name is, however, unique.
# For more information, see UTR #24: Regular Expression Guidelines

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.6 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -168,13 +168,20 @@ abstract public class PropertyLister implements UCD_Types {
return lastSpace;
}
private static final byte FAKERC = 63; // fake category for comparison
private static final byte FAKELC = 63; // fake category for comparison
private static final byte FAKENC = 64; // fake category for comparison
private byte getModCat(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lt || cat == Ll || cat == Lu) cat = FAKELC;
if (cat == Cn && ucdData.isNoncharacter(cp)) cat = FAKENC;
if (cat == UNASSIGNED && ucdData.isNoncharacter(cp)) cat = FAKENC;
else if (breakByCategory) {
if (cat == Lt || cat == Ll || cat == Lu) cat = FAKELC;
} else {
// MASH almost everything together
if (cat != CONTROL && cat != FORMAT && cat != SURROGATE
&& cat != PRIVATE_USE && cat != UNASSIGNED) cat = FAKERC;
}
return cat;
}
@ -196,7 +203,7 @@ abstract public class PropertyLister implements UCD_Types {
byte s = status(cp);
if (alwaysBreaks && s == INCLUDE) s = BREAK;
if (s == INCLUDE && firstRealCp != -1) {
if (breakByCategory && getModCat(cp) != firstRealCpCat) s = BREAK;
if (getModCat(cp) != firstRealCpCat) s = BREAK;
}
switch(s) {

View file

@ -22,6 +22,9 @@
#
# Third Field: The third field is a long name.
#
# In the case of ccc, there are 4 fields. The second field is numeric, third
# is abbreviated, and fourth is long.
#
# With loose matching of property names, the case distinctions, whitespace,
# and '_' are ignored.
#

View file

@ -0,0 +1,67 @@
# ================================================================================
# Conditional mappings
# ================================================================================
# Special case for final form of sigma
03A3; 03C2; 03A3; 03A3; FINAL_SIGMA; # GREEK CAPITAL LETTER SIGMA
# Note: the following cases for non-final are already in the UnicodeData file.
# 03A3; 03C3; 03A3; 03A3; # GREEK CAPITAL LETTER SIGMA
# 03C3; 03C3; 03A3; 03A3; # GREEK SMALL LETTER SIGMA
# 03C2; 03C2; 03A3; 03A3; # GREEK SMALL LETTER FINAL SIGMA
# Note: the following cases are not included, since they would case-fold in lowercasing
# 03C3; 03C2; 03A3; 03A3; FINAL_SIGMA; # GREEK SMALL LETTER SIGMA
# 03C2; 03C3; 03A3; 03A3; NOT_FINAL_SIGMA; # GREEK SMALL LETTER FINAL SIGMA
# ================================================================================
# Locale-sensitive mappings
# ================================================================================
# Lithuanian
# Lithuanian retains the dot in a lowercase i when followed by accents.
# Remove DOT ABOVE after "i" with upper or titlecase
0307; 0307; ; ; lt AFTER_i # COMBINING DOT ABOVE
# Introduce an explicit dot above when lowercasing capital I's and J's
# whenever there are more accents above
# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
0049; 0069 0307; 0049; 0049; lt MORE_ABOVE # LATIN CAPITAL LETTER I
004A; 006A 0307; 004A; 004A; lt MORE_ABOVE # LATIN CAPITAL LETTER J
012E; 012F 0307; 012E; 012E; lt MORE_ABOVE # LATIN CAPITAL LETTER I WITH OGONEK
00CC; 0069 0307 0300; 00CC; 00CC; lt # LATIN CAPITAL LETTER I WITH GRAVE
00CD; 0069 0307 0301; 00CD; 00CD; lt # LATIN CAPITAL LETTER I WITH ACUTE
0128; 0069 0307 0303; 0128; 0128; lt # LATIN CAPITAL LETTER I WITH TILDE
# ================================================================================
# Turkish and Azeri
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
# The following rules handle those cases.
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
# This matches the behavior of the canonically equivalent I-dot_above
0307; ; 0307; 0307; AFTER_I # COMBINING DOT ABOVE
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
0049; 0131; 0049; 0049; tr NOT_BEFORE_DOT; # LATIN CAPITAL LETTER I
0049; 0131; 0049; 0049; az NOT_BEFORE_DOT; # LATIN CAPITAL LETTER I
# When uppercasing, i turns into a dotted capital I
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
# Note: the following cases are already in the UnicodeData file.
# 0131; 0131; 0049; 0049; tr; # LATIN SMALL LETTER DOTLESS I
# 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE

View file

@ -0,0 +1,60 @@
# SpecialCasing-6.txt
#
# Special Casing Properties
#
# This file is a supplement to the UnicodeData file.
# It contains additional information about the casing of Unicode characters.
# (For compatibility, the UnicodeData.txt file only contains case mappings for
# characters where they are 1-1, and does not have locale-specific mappings.)
# For more information, see
# UTR #21 Case Mappings, at http://www.unicode.org/unicode/reports/tr21/
#
# ================================================================================
# Format
# ================================================================================
# The entries in this file are in the following machine-readable format:
#
# <code>; <lower> ; <title> ; <upper> ; (<condition_list> ;)? # <comment>
#
# <code>, <lower>, <title>, and <upper> provide character values in hex. If there is more than
# one character, they are separated by spaces.
#
# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
# separated by spaces. In these conditions:
# - A condition list overrides the normal behavior if all of the listed conditions are true.
# - Case distinctions in the condition list are not significant.
# - Conditions preceded by "NOT_" represent the negation of the condition.
# - A cased letter is any character with general category = Ll or Lo or Lt
# - An ignorable sequence is a sequence of *zero* or more characters from
# the set {HYPHEN, SOFT HYPHEN, general category = Mn}.
#
# A locale is defined as:
# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
# <ISO_3166_code> := 2-letter ISO country code,
# <ISO_639_code> := 2-letter ISO language code
#
# A context is a locale or one of the following choices:
# CFINAL: The character is not followed by a sequence consisting of
# an ignorable sequence and then a cased letter.
# CINITIAL: The character is not preceded by a sequence consisting of
# a cased letter and an ignorable sequence.
# FINAL_SIGMA: CFINAL and NOT_CINITIAL
# TYPE_i: The character is "i" (0069), "j" (006A),
# or has a canonical decomposition that begins with an "i" or "j"
# but has no combining characters above (i.e., i-ogonek (012F),
# i-tilde-below (1E2D), or i-dot-below (1ECB)).
# AFTER_i: The last preceding base character was TYPE_i, and
# no combining character class 230 (above) has intervened.
# MORE_ABOVE: The character is followed by one or more characters of
# combining class 230 (ABOVE) in the combining character sequence
#
# Other than as used to separate elements, spaces are to be ignored.
#
# Parsers of this file must be prepared to deal with future additions to this format:
# * Additional contexts
# * Additional fields
# ================================================================================
# ================================================================================
# Unconditional mappings
# ================================================================================

View file

@ -0,0 +1,13 @@
# IMPORTANT-when capitalizing iota-subscript (0345)
# It MUST be in normalized form--moved to the end of any sequence of combining marks.
# This is because logically it represents a following base character!
# E.g. <iota_subscript> (<Mn> | <Mc> | <Me>)+ => (<Mn> | <Mc> | <Me>)+ <iota_subscript>
# It should never be the first character in a word, so in titlecasing it can be left as is.
# The following cases are already in the UnicodeData file, so are only commented here.
# 0345; 0345; 0345; 0399; # COMBINING GREEK YPOGEGRAMMENI
# All letters with YPOGEGRAMMENI (iota-subscript) or PROSGEGRAMMENI (iota adscript)
# have special uppercases.
# Note: characters with PROSGEGRAMMENI are actually titlecase, not uppercase!

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.7 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -21,6 +21,7 @@ import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
public class TestData implements UCD_Types {
/*
public static void main (String[] args) throws IOException {
System.out.println("START");
@ -200,7 +201,6 @@ public class TestData implements UCD_Types {
}
output.close();
}
*/
public static void generateCompExclusions() throws IOException {
PrintWriter output = Utility.openPrintWriter("CompositionExclusionsDelta.txt");
@ -246,7 +246,7 @@ public class TestData implements UCD_Types {
System.out.println(ucd.getData(0x100000-3));
if (true) return;
String test2 = ucd.getName(0x2A6D6);
//*/
//* /
PrintWriter output = Utility.openPrintWriter(file);
@ -485,5 +485,5 @@ public class TestData implements UCD_Types {
"E\u0304\u0300",
"E\u0300\u0304",
};
//*/
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestNormalization.java,v $
* $Date: 2001/08/31 00:30:17 $
* $Revision: 1.2 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -25,12 +25,6 @@ public final class TestNormalization {
static PrintWriter out = null;
static BufferedReader in = null;
static Normalizer nfc;
static Normalizer nfd;
static Normalizer nfkc;
static Normalizer nfkd;
static UCD ucd;
static BitSet charsListed = new BitSet(0x110000);
static int errorCount = 0;
static int lineErrorCount = 0;
@ -39,18 +33,14 @@ public final class TestNormalization {
public static void main(String[] args) throws java.io.IOException {
System.out.println("Creating Normalizers");
ucd = UCD.make("");
Main.setUCD();
nfc = new Normalizer(Normalizer.NFC);
nfd = new Normalizer(Normalizer.NFD);
nfkc = new Normalizer(Normalizer.NFKC);
nfkd = new Normalizer(Normalizer.NFKD);
String x = UTF32.valueOf32(0x10000);
check("NFC", nfc, x);
check("NFD", nfd, x);
check("NFKC", nfkc, x);
check("NFKD", nfkd, x);
check("NFC", Main.nfc, x);
check("NFD", Main.nfd, x);
check("NFKC", Main.nfkc, x);
check("NFKD", Main.nfkd, x);
out = new PrintWriter(
@ -97,36 +87,36 @@ public final class TestNormalization {
}
// c2 == NFC(c1) == NFC(c2) == NFC(c3)
errorCount += check("NFCa", nfc, parts[1], parts[0]);
errorCount += check("NFCb", nfc, parts[1], parts[1]);
errorCount += check("NFCc", nfc, parts[1], parts[2]);
errorCount += check("NFCa", Main.nfc, parts[1], parts[0]);
errorCount += check("NFCb", Main.nfc, parts[1], parts[1]);
errorCount += check("NFCc", Main.nfc, parts[1], parts[2]);
// c4 == NFC(c4) == NFC(c5)
errorCount += check("NFCd", nfc, parts[3], parts[3]);
errorCount += check("NFCe", nfc, parts[3], parts[4]);
errorCount += check("NFCd", Main.nfc, parts[3], parts[3]);
errorCount += check("NFCe", Main.nfc, parts[3], parts[4]);
// c3 == NFD(c1) == NFD(c2) == NFD(c3)
errorCount += check("NFDa", nfd, parts[2], parts[0]);
errorCount += check("NFDb", nfd, parts[2], parts[1]);
errorCount += check("NFDc", nfd, parts[2], parts[2]);
errorCount += check("NFDa", Main.nfd, parts[2], parts[0]);
errorCount += check("NFDb", Main.nfd, parts[2], parts[1]);
errorCount += check("NFDc", Main.nfd, parts[2], parts[2]);
// c5 == NFD(c4) == NFD(c5)
errorCount += check("NFDd", nfd, parts[4], parts[3]);
errorCount += check("NFDe", nfd, parts[4], parts[4]);
errorCount += check("NFDd", Main.nfd, parts[4], parts[3]);
errorCount += check("NFDe", Main.nfd, parts[4], parts[4]);
// c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
errorCount += check("NFKCa", nfkc, parts[3], parts[0]);
errorCount += check("NFKCb", nfkc, parts[3], parts[1]);
errorCount += check("NFKCc", nfkc, parts[3], parts[2]);
errorCount += check("NFKCd", nfkc, parts[3], parts[3]);
errorCount += check("NFKCe", nfkc, parts[3], parts[4]);
errorCount += check("NFKCa", Main.nfkc, parts[3], parts[0]);
errorCount += check("NFKCb", Main.nfkc, parts[3], parts[1]);
errorCount += check("NFKCc", Main.nfkc, parts[3], parts[2]);
errorCount += check("NFKCd", Main.nfkc, parts[3], parts[3]);
errorCount += check("NFKCe", Main.nfkc, parts[3], parts[4]);
// c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
errorCount += check("NFKDa", nfkd, parts[4], parts[0]);
errorCount += check("NFKDb", nfkd, parts[4], parts[1]);
errorCount += check("NFKDc", nfkd, parts[4], parts[2]);
errorCount += check("NFKDd", nfkd, parts[4], parts[3]);
errorCount += check("NFKDe", nfkd, parts[4], parts[4]);
errorCount += check("NFKDa", Main.nfkd, parts[4], parts[0]);
errorCount += check("NFKDb", Main.nfkd, parts[4], parts[1]);
errorCount += check("NFKDc", Main.nfkd, parts[4], parts[2]);
errorCount += check("NFKDd", Main.nfkd, parts[4], parts[3]);
errorCount += check("NFKDe", Main.nfkd, parts[4], parts[4]);
}
System.out.println("Total errors in file: " + errorCount
+ ", lines: " + lineErrorCount);
@ -160,21 +150,21 @@ public final class TestNormalization {
}
String otherList = "";
if (!base.equals(other)) {
otherList = "(" + ucd.getCodeAndName(other) + ")";
otherList = "(" + Main.ucd.getCodeAndName(other) + ")";
}
out.println("DIFF " + type + ": "
+ ucd.getCodeAndName(base) + " != "
+ Main.ucd.getCodeAndName(base) + " != "
+ type
+ otherList
+ " == " + ucd.getCodeAndName(trans)
+ " == " + Main.ucd.getCodeAndName(trans)
+ temp
);
return 1;
}
} catch (Exception e) {
throw new ChainException("DIFF " + type + ": "
+ ucd.getCodeAndName(base) + " != "
+ type + "(" + ucd.getCodeAndName(other) + ")", new Object[]{}, e);
+ Main.ucd.getCodeAndName(base) + " != "
+ type + "(" + Main.ucd.getCodeAndName(other) + ")", new Object[]{}, e);
}
return 0;
}
@ -188,10 +178,10 @@ public final class TestNormalization {
if ((missing & 0xFFF) == 0) System.out.println("# " + Utility.hex(missing));
if (charsListed.get(missing)) continue;
String x = UTF32.valueOf32(missing);
errorCount += check("NFC", nfc, x);
errorCount += check("NFD", nfd, x);
errorCount += check("NFKC", nfkc, x);
errorCount += check("NFKD", nfkd, x);
errorCount += check("NFC", Main.nfc, x);
errorCount += check("NFD", Main.nfd, x);
errorCount += check("NFKC", Main.nfkc, x);
errorCount += check("NFKD", Main.nfkd, x);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.8 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.9 $
*
*******************************************************************************
*/
@ -48,6 +48,7 @@ public final class UCD implements UCD_Types {
if (version.indexOf('.') < 0) throw new IllegalArgumentException("Version must be of form 3.1.1");
UCD result = (UCD)versionCache.get(version);
if (result == null) {
//System.out.println(Utility.getStack());
result = new UCD();
result.fillFromFile(version);
versionCache.put(version, result);
@ -569,7 +570,8 @@ public final class UCD implements UCD_Types {
}
static String getCombiningClassID_fromIndex (short index, byte style) {
if (style == NORMAL || style == NUMBER) return String.valueOf(index & 0xFF);
index &= 0xFF;
if (style == NORMAL || style == NUMBER) return String.valueOf(index);
String s = "Fixed";
switch (index) {
case 0: s = style < LONG ? "NR" : "NotReordered"; break;
@ -619,7 +621,7 @@ public final class UCD implements UCD_Types {
}
public static String getDecompositionTypeID_fromIndex(byte prop) {
return getDecompositionTypeID_fromIndex(NORMAL);
return getDecompositionTypeID_fromIndex(prop, NORMAL);
}
public static String getDecompositionTypeID_fromIndex(byte prop, byte style) {
return style == SHORT ? UCD_Names.SHORT_DT[prop] : UCD_Names.DT[prop];
@ -1069,7 +1071,7 @@ to guarantee identifier closure.
uData.joiningType = JT_T;
}
if (!didJoiningHack && uData.joiningType != old) {
System.out.println("HACK: Setting "
System.out.println("HACK " + foundVersion + ": Setting "
+ UCD_Names.LONG_JOINING_TYPE[uData.joiningType]
+ ": " + Utility.hex(cp) + " " + uData.name);
didJoiningHack = true;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.10 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.11 $
*
*******************************************************************************
*/
@ -119,6 +119,8 @@ final class UCD_Names implements UCD_Types {
"Unified_Ideograph",
"Other_Default_Ignorable_Code_Point",
"Deprecated",
"Soft_Dotted",
"Logical_Order_Exception",
};
static final String[] SHORT_BP = {
@ -151,6 +153,8 @@ final class UCD_Names implements UCD_Types {
"UIdeo",
"ODI",
"Dep",
"SD",
"LOE",
};
/*

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.7 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.8 $
*
*******************************************************************************
*/
@ -28,6 +28,11 @@ public interface UCD_Types {
DERIVED_NORMALIZATION = 4,
DERIVED_ALL = 6,
ALL = (byte)-1;
static final byte
NON_ENUMERATED = -1,
ENUMERATED = 0,
BINARY = 1;
/*
0 Code value in 4-digit hexadecimal format.
@ -180,7 +185,9 @@ public interface UCD_Types {
UnifiedIdeograph = 26,
Reserved_Cf_Code_Point = 27,
Deprecated = 28,
LIMIT_BINARY_PROPERTIES = 29;
Soft_Dotted = 29,
Logical_Order_Exception = 30,
LIMIT_BINARY_PROPERTIES = 31;
/*
static final int

View file

@ -1,11 +1,15 @@
package com.ibm.text.UCD;
import com.ibm.text.UnicodeSet;
import com.ibm.text.utility.*;
public abstract class UnicodeProperty implements UCD_Types {
protected UCD ucd;
protected boolean isStandard = true;
protected byte type = NOT_DERIVED;
private byte valueType = BINARY;
protected boolean hasUnassigned = false;
protected boolean valueVaries = false;
protected boolean isBinary = true;
protected byte defaultValueStyle = SHORT;
protected byte defaultPropertyStyle = LONG;
protected String valueName;
@ -29,11 +33,17 @@ public abstract class UnicodeProperty implements UCD_Types {
public void setStandard(boolean in) { isStandard = in; }
/**
* What type is it?
* What type is it? DERIVED..
*/
public byte getType() { return type; }
public void setType(byte in) { type = in; }
/**
* Kind of value this property carries: one of the UCD_Types constants
* NON_ENUMERATED, ENUMERATED or BINARY. A new property starts out as BINARY.
* getValue(byte) and setValue(byte, String) throw IllegalArgumentException
* for any kind other than BINARY.
* The setter stores the byte as given; it does not validate it.
*/
public byte getValueType() { return valueType; }
public void setValueType(byte in) { valueType = in; }
/**
* Does it apply to any unassigned characters?
*/
@ -66,7 +76,7 @@ public abstract class UnicodeProperty implements UCD_Types {
public String getProperty(byte style) {
if (style == NORMAL) style = defaultPropertyStyle;
switch (style) {
case LONG: return name.toString();
case LONG: return Utility.getUnskeleton(name.toString(), false);
case SHORT: return shortName.toString();
case NUMBER: return numberName.toString();
default: throw new IllegalArgumentException("Bad property: " + style);
@ -78,7 +88,7 @@ public abstract class UnicodeProperty implements UCD_Types {
public void setProperty(byte style, String in) {
if (style == NORMAL) style = defaultPropertyStyle;
switch (style) {
case LONG: name = in; break;
case LONG: name = Utility.getUnskeleton(in, false); break;
case SHORT: shortName = in; break;
case NUMBER: numberName = in; break;
default: throw new IllegalArgumentException("Bad property: " + style);
@ -98,10 +108,10 @@ public abstract class UnicodeProperty implements UCD_Types {
public String getValue(int cp) { return getValue(cp, NORMAL); }
public void setValue(byte style, String in) {
if (valueVaries) throw new IllegalArgumentException("Can't set varying value: " + style);
if (getValueType() != BINARY) throw new IllegalArgumentException("Can't set varying value: " + style);
if (style == NORMAL) style = defaultValueStyle;
switch (style) {
case LONG: valueName = in; break;
case LONG: valueName = Utility.getUnskeleton(in, false); break;
case SHORT: shortValueName = in; break;
case NUMBER: numberValueName = in; break;
default: throw new IllegalArgumentException("Bad value: " + style);
@ -109,12 +119,12 @@ public abstract class UnicodeProperty implements UCD_Types {
}
public String getValue(byte style) {
if (valueVaries) throw new IllegalArgumentException(
if (getValueType() != BINARY) throw new IllegalArgumentException(
"Value varies in " + getName(LONG) + "; call getValue(cp)");
try {
if (style == NORMAL) style = defaultValueStyle;
switch (style) {
case LONG: return valueName.toString();
case LONG: return Utility.getUnskeleton(valueName.toString(), false);
case SHORT: return shortValueName.toString();
case NUMBER: return numberValueName.toString();
default: throw new IllegalArgumentException("Bad property: " + style);
@ -124,17 +134,27 @@ public abstract class UnicodeProperty implements UCD_Types {
}
}
/**
* Does getProperty vary in contents?
*/
public boolean valueVaries() { return valueVaries; }
public void setValueVaries(boolean in) { valueVaries = in; }
/**
* Does it have the propertyValue?
*/
abstract boolean hasValue(int cp);
/**
 * Lazily built set of every code point for which hasValue(cp) is true.
 * Stays null until getSet() has completed a full scan successfully.
 */
private UnicodeSet cache = null;

/**
 * Get the set of characters it contains.
 * The first call scans every code point from 0 through 0x10FFFF with
 * hasValue(cp) and remembers the result; later calls reuse it.
 *
 * @return a fresh copy of the cached set, so callers may modify the
 *         result without disturbing the cache
 */
public UnicodeSet getSet() {
    if (cache == null) {
        // Build into a local and publish only once the scan has finished:
        // if hasValue() throws partway through, no half-filled set is left
        // behind to be returned by later calls.
        UnicodeSet computed = new UnicodeSet();
        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
            if (hasValue(cp)) computed.add(cp);
        }
        cache = computed;
    }
    return (UnicodeSet) cache.clone();
}
///////////////////////////////////////////
// Old Name for compatibility

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UnifiedBinaryProperty.java,v $
* $Date: 2001/12/06 00:05:53 $
* $Revision: 1.3 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -16,6 +16,7 @@ import java.io.*;
import java.util.*;
import com.ibm.text.utility.*;
import com.ibm.text.UnicodeSet;
final class UnifiedBinaryProperty extends UnicodeProperty {
int majorProp;
@ -30,6 +31,54 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
return getCached(propMask, ucd);
}
/**
 * Looks up a property by its "name=value" text.
 * The text is resolved to a property mask with getPropmask, which throws
 * IllegalArgumentException when nothing matches, and the mask is then handed
 * to make(int, UCD).
 */
public static UnicodeProperty make(String propAndValue, UCD ucd) {
    final int propMask = getPropmask(propAndValue, ucd);
    return make(propMask, ucd);
}
/**
 * Returns the set of code points belonging to the property with the given mask.
 * NOTE(review): make(int, UCD) appears able to return null (getPropmask checks
 * for that), in which case this method throws NullPointerException.
 */
public static UnicodeSet getSet(int propMask, UCD ucd) {
    return make(propMask, ucd).getSet();
}
/**
 * Returns the set of code points for the property named by "name=value" text.
 * The text is resolved with getPropmask, which throws IllegalArgumentException
 * when nothing matches.
 */
public static UnicodeSet getSet(String propAndValue, UCD ucd) {
    final int propMask = getPropmask(propAndValue, ucd);
    return getSet(propMask, ucd);
}
/**
 * Maps skeletonized "name=value" strings to the Integer property mask.
 * Null until the first successful call to getPropmask.
 */
private static Map propNameCache = null;

/**
 * Resolves "property=value" text to its property mask.
 * Matching is done on skeletons (see Utility.getSkeleton), and either the long
 * or the short form may be used for both the property name and the value.
 *
 * @param propAndValue text of the form name=value
 * @param ucd the UCD used to create the properties when the name table is
 *        first built
 * @return the property mask registered for that text
 * @throws IllegalArgumentException if no standard binary-valued property matches
 */
public static int getPropmask(String propAndValue, UCD ucd) {
    // cache the names
    if (propNameCache == null) {
        System.out.println("Caching Property Names");
        // Publish the table only after it has been built completely; if
        // building throws, the next call retries instead of seeing a
        // truncated table and reporting valid names as unknown.
        propNameCache = buildPropNameCache(ucd);
        System.out.println("Done Caching");
    }
    // NOTE(review): the table is built from the first ucd passed in and reused
    // for every later call, whatever ucd they pass -- confirm that is intended.
    propAndValue = Utility.getSkeleton(propAndValue);
    Integer indexObj = (Integer) propNameCache.get(propAndValue);
    if (indexObj == null) {
        throw new IllegalArgumentException("No property found for " + propAndValue);
    }
    return indexObj.intValue();
}

/**
 * Builds the lookup table for getPropmask: for every standard property whose
 * value type is BINARY, registers all four long/short combinations of name and value.
 */
private static Map buildPropNameCache(UCD ucd) {
    Map names = new HashMap();
    for (int i = 0; i < LIMIT_ENUM; ++i) {
        UnicodeProperty up = UnifiedBinaryProperty.make(i, ucd);
        if (up == null) continue;
        if (!up.isStandard()) continue;
        if (up.getValueType() != BINARY) continue;
        String shortValue = Utility.getSkeleton(up.getValue(SHORT));
        String shortName = Utility.getSkeleton(up.getProperty(SHORT));
        String longValue = Utility.getSkeleton(up.getValue(LONG));
        String longName = Utility.getSkeleton(up.getProperty(LONG));
        Integer result = new Integer(i);
        names.put(longName + "=" + longValue, result);
        names.put(longName + "=" + shortValue, result);
        names.put(shortName + "=" + longValue, result);
        names.put(shortName + "=" + shortValue, result);
    }
    return names;
}
static Map cache = new HashMap();
static UCD lastUCD = null;
static int lastPropMask = -1;
@ -76,7 +125,16 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
shortValueName = _getValue(SHORT);
numberValueName = _getValue(NUMBER);
defaultValueStyle = _getDefaultStyle();
System.out.println("Value = " + getValue(defaultValueStyle));
if (majorProp == (BINARY_PROPERTIES>>8)) {
name = valueName;
shortName = shortValueName;
defaultPropertyStyle = defaultValueStyle;
valueName = "YES";
shortValueName = "Y";
}
// System.out.println("Value = " + getValue(defaultValueStyle));
// System.out.println(majorProp + ", " + propValue + ", " + name);
// dp = new DerivedProperty(ucd);
}
@ -247,9 +305,7 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
return UCD_Names.LONG_JOINING_TYPE[propValue];
case JOINING_GROUP>>8: if (propValue >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroupID_fromIndex((byte)propValue);
case BINARY_PROPERTIES>>8: if (propValue >= LIMIT_BINARY_PROPERTIES) break;
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex((byte)propValue);
return UCD_Names.SHORT_BP[propValue];
case BINARY_PROPERTIES>>8: return ucd.getBinaryPropertiesID_fromIndex((byte)propValue, style);
case SCRIPT>>8: if (propValue >= LIMIT_SCRIPT) break;
if (style != SHORT) return ucd.getScriptID_fromIndex((byte)propValue);
return UCD_Names.ABB_SCRIPT[propValue];
@ -263,7 +319,7 @@ final class UnifiedBinaryProperty extends UnicodeProperty {
*/
}
} catch (RuntimeException e) {
throw new ChainException("Illegal property Number {0}, {1}", new Object[]{
throw new ChainException("Illegal property Number* {0}, {1}", new Object[]{
new Integer(majorProp), new Integer(propValue)}, e);
}
throw new ChainException("Illegal property Number {0}, {1}", new Object[]{

File diff suppressed because it is too large Load diff

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/Utility.java,v $
* $Date: 2001/12/06 00:05:52 $
* $Revision: 1.9 $
* $Date: 2001/12/13 23:35:57 $
* $Revision: 1.10 $
*
*******************************************************************************
*/
@ -16,6 +16,8 @@ package com.ibm.text.utility;
import java.util.*;
import java.text.*;
import java.io.*;
import com.ibm.text.UnicodeSet;
import com.ibm.text.UCD.*;
public final class Utility { // COMMON UTILITIES
@ -85,7 +87,65 @@ public final class Utility { // COMMON UTILITIES
}
return -1;
}
/**
 * Produces the loose-matching form of a name: every space and underscore is
 * dropped and all remaining characters are lowercased.
 * Plain char-based Java functions are sufficient because the names handled
 * here are ASCII, so no surrogates are involved.
 *
 * @param source the name to reduce
 * @return the reduced name, or source itself when it is already in that form
 */
public static String getSkeleton(String source) {
    StringBuffer buffer = new StringBuffer();
    boolean changed = false;
    final int length = source.length();
    for (int index = 0; index < length; ++index) {
        final char original = source.charAt(index);
        if (original == ' ' || original == '_') {
            changed = true;
            continue;
        }
        final char lowered = Character.toLowerCase(original);
        changed |= (lowered != original);
        buffer.append(lowered);
    }
    // Hand back the caller's own string when nothing was altered, which
    // avoids creating a new String.
    return changed ? buffer.toString() : source;
}
/**
 * Produces the display form of a name: spaces and hyphens become underscores,
 * and an underscore is inserted wherever a lowercase letter is directly
 * followed by an uppercase letter.
 * Plain char-based Java functions are sufficient because the names handled
 * here are ASCII.
 *
 * @param source the name to convert
 * @param titlecaseStart if true, the first cased letter is uppercased
 * @return the converted name (always a new string)
 */
public static String getUnskeleton(String source, boolean titlecaseStart) {
    StringBuffer out = new StringBuffer();
    boolean awaitingFirstCased = true;
    int previousType = -1;
    final int length = source.length();
    for (int index = 0; index < length; ++index) {
        char current = source.charAt(index);
        switch (current) {
            case ' ':
            case '-':
                current = '_';
                break;
            default:
                break;
        }
        // The category is taken before any uppercasing below, so it reflects
        // the case the character had in the source.
        final int type = Character.getType(current);
        final boolean upper = (type == Character.UPPERCASE_LETTER);
        final boolean lower = (type == Character.LOWERCASE_LETTER);
        if (upper && previousType == Character.LOWERCASE_LETTER) {
            out.append('_');
        }
        if (awaitingFirstCased && (lower || upper || type == Character.TITLECASE_LETTER)) {
            awaitingFirstCased = false;
            if (titlecaseStart) {
                current = Character.toUpperCase(current);
            }
        }
        out.append(current);
        previousType = type;
    }
    return out.toString();
}
public static String findSubstring(String source, Set target, boolean invert) {
Iterator it = target.iterator();
while (it.hasNext()) {
@ -178,6 +238,10 @@ public final class Utility { // COMMON UTILITIES
return result.toString();
}
/**
* Returns a string containing count copies of s.
* If count <= 0, returns "".
*/
public static String repeat(String s, int count) {
if (count <= 0) return "";
if (count == 1) return s;
@ -260,6 +324,10 @@ public final class Utility { // COMMON UTILITIES
return output.toString();
}
/**
* Splits a string containing divider into pieces, storing in output
* and returns the number of pieces.
*/
public static int split(String s, char divider, String[] output) {
int last = 0;
int current = 0;
@ -407,19 +475,22 @@ public final class Utility { // COMMON UTILITIES
return (aEnd - aStart) - (bEnd - bStart);
}
public static String join(int[] array, String sep) {
/**
* Joins an array together, using divider between the pieces
*/
public static String join(int[] array, String divider) {
String result = "{";
for (int i = 0; i < array.length; ++i) {
if (i != 0) result += sep;
if (i != 0) result += divider;
result += array[i];
}
return result + "}";
}
public static String join(long[] array, String sep) {
public static String join(long[] array, String divider) {
String result = "{";
for (int i = 0; i < array.length; ++i) {
if (i != 0) result += sep;
if (i != 0) result += divider;
result += array[i];
}
return result + "}";
@ -506,16 +577,18 @@ public final class Utility { // COMMON UTILITIES
}
public static BufferedReader openUnicodeFile(String filename, String version, boolean show) throws IOException {
String name = getMostRecentUnicodeDataFile(filename, version, show);
String name = getMostRecentUnicodeDataFile(filename, version, true, show);
if (name == null) return null;
return new BufferedReader(new FileReader(name),32*1024);
}
public static String getMostRecentUnicodeDataFile(String filename, String version, boolean show) throws IOException {
public static String getMostRecentUnicodeDataFile(String filename, String version,
boolean acceptLatest, boolean show) throws IOException {
// get all the files in the directory
int compValue = acceptLatest ? 0 : 1;
for (int i = 0; i < searchPath.length; ++i) {
if (version.length() != 0 && version.compareTo(searchPath[i]) < 0) continue;
if (version.length() != 0 && version.compareTo(searchPath[i]) < compValue) continue;
String directoryName = DATA_DIR + File.separator + searchPath[i] + "-Update" + File.separator;
if (show) System.out.println("Trying: '" + directoryName + "', '" + filename + "'");
@ -549,6 +622,9 @@ public final class Utility { // COMMON UTILITIES
log.println("</head><body>");
}
/**
* Replaces all occurrences of piece with replacement, and returns new String
*/
public static String replace(String source, String piece, String replacement) {
while (true) {
int pos = source.indexOf(piece);
@ -556,4 +632,30 @@ public final class Utility { // COMMON UTILITIES
source = source.substring(0,pos) + source.substring(pos + piece.length());
}
}
/**
 * Captures the current call stack as text, for debugging.
 * A throwaway Exception is created only so that its stack trace can be
 * printed; it is never thrown.
 *
 * @return "Showing Stack with fake " followed by the printed stack trace
 */
public static String getStack() {
    StringWriter trace = new StringWriter();
    PrintWriter printer = new PrintWriter(trace);
    new Exception().printStackTrace(printer);
    printer.flush();
    return "Showing Stack with fake " + trace.toString();
}
/**
 * Prints the contents of a set to System.out, one line per entry.
 *
 * @param prefix text placed at the start of every printed line
 * @param set the set to list, walked range by range
 * @param all if true, every code point is printed on its own line; otherwise
 *        each range is printed once as "start..end" (or just "start" for a
 *        single-code-point range)
 * @param ucd supplies the code-and-name text for each code point
 */
public static void showSetNames(String prefix, UnicodeSet set, boolean all, UCD ucd) {
    final int rangeCount = set.getRangeCount();
    for (int r = 0; r < rangeCount; ++r) {
        final int first = set.getRangeStart(r);
        final int last = set.getRangeEnd(r);
        if (!all) {
            String line = prefix + ucd.getCodeAndName(first);
            if (first != last) {
                line += ".." + ucd.getCodeAndName(last);
            }
            System.out.println(line);
            continue;
        }
        for (int cp = first; cp <= last; ++cp) {
            // NOTE(review): presumably always true inside the set's own ranges; kept as in the original.
            if (set.contains(cp)) {
                System.out.println(prefix + ucd.getCodeAndName(cp));
            }
        }
    }
}
}

View file

@ -3,7 +3,54 @@ WARNING!!
These directories contain some Unicode tools used to build various files,
and to check the consistency of the Unicode releases.
They are NOT production level code, and should never be used in programs.
The API is subject to change without notice, and will not be maintained.
The source is uncommented, and not well structured -- classic spaghetti style.
There is no build mechanism.
- They are NOT production level code, and should never be used in programs.
- The API is subject to change without notice, and will not be maintained.
- The source is uncommented, and not well structured -- classic spaghetti style.
- There is no build mechanism.
- I have not checked to make sure it works on Unix; probably the only change that
needs to be made is to fix the file separator.
Instructions:
1. You must edit UCD_Types at the top, to set the directories for the build:
public static final String DATA_DIR = "C:\\DATA\\";
public static final String BIN_DIR = DATA_DIR + "BIN\\";
public static final String GEN_DIR = DATA_DIR + "GEN\\";
Make sure that each of these directories exists. Also make sure that the following directories exist:
<GEN_DIR>/DerivedData
<GEN_DIR>/DerivedData/ExtractedProperties
2. Download all of the UnicodeData files for each version into DATA_DIR
The folder names must be of the form: "3.2.0-Update"
3. For each version X (like 3.1.0), run
java version X build
This builds a compressed format of all the UCD data (except blocks and Unihan)
into the BIN directory. Don't worry about the voluminous console messages, unless one says
"FAIL".
4. To build all of the files for a particular version X, run
java version X all
To build a particular file, like CaseFolding, use that file name instead of all
java version X CaseFolding
To change the D version, edit the line in GenerateData.java:
static final int dVersion = 2; // change to fix the generated file D version. If less than zero, no "d"
5. To run basic consistency checking, run:
java version X verify
Don't worry about any console messages except those that say FAIL.