additional derived properties

X-SVN-Rev: 6434
This commit is contained in:
Mark Davis 2001-10-25 20:33:46 +00:00
parent 73c3b12dd4
commit 1dbf16c3c3
13 changed files with 797 additions and 351 deletions

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.4 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -14,55 +14,13 @@
package com.ibm.text.UCD;
import com.ibm.text.utility.*;
import com.ibm.text.*;
import java.util.*;
public class DerivedProperty implements UCD_Types {
UCD ucdData;
static final int
PropMath = 0,
PropAlphabetic = 1,
PropLowercase = 2,
PropUppercase = 3,
ID_Start = 4,
ID_Continue_NO_Cf = 5,
Mod_ID_Start = 6,
Mod_ID_Continue_NO_Cf = 7,
Missing_Uppercase = 8,
Missing_Lowercase = 9,
Missing_Mixedcase = 10,
FC_NFKC_Closure = 11,
FullCompExclusion = 12,
FullCompInclusion = 13,
QuickNFD = 14,
QuickNFC = 15,
QuickNFKD = 16,
QuickNFKC = 17,
ExpandsOnNFD = 18,
ExpandsOnNFC = 19,
ExpandsOnNFKD = 20,
ExpandsOnNFKC = 21,
GenNFD = 22,
GenNFC = 23,
GenNFKD = 24,
GenNFKC = 25,
DefaultIgnorable = 26,
GraphemeExtend = 27,
GraphemeBase = 28,
FC_NFC_Closure = 29,
LIMIT = 30;
// ADD CONSTANT to UCD_TYPES
public DerivedProperty(UCD ucd) {
ucdData = ucd;
@ -74,9 +32,9 @@ public class DerivedProperty implements UCD_Types {
else return "Unimplemented!!";
}
public String getName(int propNumber) {
public String getName(int propNumber, byte style) {
DProp dp = dprops[propNumber];
if (dp != null) return dp.getName();
if (dp != null) return dp.getName(style);
else return "Unimplemented!!";
}
@ -87,10 +45,17 @@ public class DerivedProperty implements UCD_Types {
}
/**
 * Reports whether a property object is registered under the given number.
 * Numbers outside the bounds of the dprops table are reported as undefined.
 */
public boolean isDefined(int propNumber) {
    boolean inRange = 0 <= propNumber && propNumber < dprops.length;
    return inRange && dprops[propNumber] != null;
}
/**
 * Returns the isTest() flag of the given property, or false when no
 * property is defined for that number.
 */
public boolean isTest(int propNumber) {
    return isDefined(propNumber) && dprops[propNumber].isTest();
}
/**
 * Asks the given property whether it holds for code point cp.
 * An undefined property number never holds.
 */
public boolean hasProperty(int cp, int propNumber) {
    return isDefined(propNumber) && dprops[propNumber].hasProperty(cp);
}
@ -112,9 +77,15 @@ public class DerivedProperty implements UCD_Types {
"Mixedcase"};
private abstract class DProp {
String name, header;
String getName() { return name; }
boolean testStatus = false;
byte defaultStyle = LONG;
String name, shortName, header;
String getName(byte style) {
if (style == NORMAL) style = defaultStyle;
return style < LONG ? shortName : name;
}
String getHeader() { return header; }
boolean isTest() { return testStatus; }
abstract boolean hasProperty(int cp);
public boolean propertyVaries() { return false; }
public String getProperty(int cp) { return hasProperty(cp) ? name : ""; }
@ -125,6 +96,7 @@ public class DerivedProperty implements UCD_Types {
ExDProp(int i) {
nfx = nf[i-ExpandsOnNFD];
name = "Expands_On_" + NAME[i-ExpandsOnNFD];
shortName = "XO_" + NAME[i-ExpandsOnNFD];
header = "# Derived Property: " + name
+ "\r\n# Generated according to UAX #15."
+ "\r\n# Characters whose normalized length is not one."
@ -139,11 +111,80 @@ public class DerivedProperty implements UCD_Types {
}
};
/**
 * Derived property "<form>_UnsafeStart": code points with combining class 0
 * that may nevertheless interact with preceding characters under the given
 * normalization form.
 */
class NF_UnsafeStartProp extends DProp {
    /** Normalizer for the form this property describes. */
    Normalizer nfx;
    /** Offset of this property from NFD_UnsafeStart; also indexes nf[] and NAME[]. */
    int prop;

    NF_UnsafeStartProp(int i) {
        prop = i - NFD_UnsafeStart;
        nfx = nf[prop];
        String formName = NAME[prop];
        name = formName + "_UnsafeStart";
        shortName = formName + "_SS";
        header = "# Derived Property: " + name
            + "\r\n# Generated according to UAX #15."
            + "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
            ;
    }

    boolean hasProperty(int cp) {
        // Only characters that are themselves cc == 0 are candidates.
        if (ucdData.getCombiningClass(cp) != 0) return false;

        // Inspect the first code point of the normalized form.
        int first = UTF16.charAt(nfx.normalize(cp), 0);
        if (ucdData.getCombiningClass(first) != 0) return true;

        // Offsets 1 and 3 are the composing forms: there a cc == 0 trailing
        // character can still combine with what precedes it.
        boolean composing = prop == 1 || prop == 3;
        return composing && dprops[NFC_TrailingZero].hasProperty(first);
    }
};
/**
 * Derived properties describing the role a code point can play in NFC
 * composition (leading, trailing with zero / non-zero combining class,
 * or resulting), backed by the sets reported by nfc.getCompositionStatus.
 */
class NFC_Prop extends DProp {
    final String[] Names = {"NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", "NFC_Resulting"};
    final String[] SNames = {"NFC_L", "NFC_TNZ", "NFC_TZ", "NFC_R"};
    final String[] Description = {
        "Characters that can combine with following characters in NFC",
        "Characters that can combine with previous characters in NFC, and have non-zero combining class",
        "Characters that can combine with previous characters in NFC, and have zero combining class",
        "Characters that can result from a combination of other characters in NFC",
    };

    /** The set filled in for this property; the others are passed as null. */
    BitSet bitset;
    /** True for the two trailing properties, which share one set and are split by combining class. */
    boolean filter = false;
    /** When filtering: keep code points with non-zero (true) or zero (false) combining class. */
    boolean keepNonZero = true;

    NFC_Prop(int i) {
        BitSet leading = null;
        BitSet trailing = null;
        BitSet resulting = null;

        if (i == NFC_Leading) {
            leading = bitset = new BitSet();
        } else if (i == NFC_Resulting) {
            resulting = bitset = new BitSet();
        } else if (i == NFC_TrailingZero || i == NFC_TrailingNonZero) {
            if (i == NFC_TrailingZero) keepNonZero = false;
            trailing = bitset = new BitSet();
        }

        filter = trailing != null;
        nfc.getCompositionStatus(leading, trailing, resulting);

        int index = i - NFC_Leading;
        name = Names[index];
        shortName = SNames[index];
        header = "# Derived Property: " + name
            + "\r\n# " + Description[index]
            + "\r\n# NFKC characters are the same, after subtracting the NFKD = NO values."
            + "\r\n# Generated according to UAX #15."
            + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
    }

    boolean hasProperty(int cp) {
        if (!bitset.get(cp)) return false;
        if (!filter) return true;
        // Trailing sets: split members by whether their combining class is zero.
        boolean nonZero = ucdData.getCombiningClass(cp) != 0;
        return nonZero == keepNonZero;
    }
};
class GenDProp extends DProp {
Normalizer nfx;
Normalizer nfComp = null;
GenDProp (int i) {
testStatus = true;
nfx = nf[i-GenNFD];
name = NAME[i-GenNFD];
String compName = "the character itself";
@ -201,6 +242,7 @@ public class DerivedProperty implements UCD_Types {
name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
header = "# Derived Property: " + name
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
testStatus = true;
}
boolean hasProperty(int cp) {
byte cat = ucdData.getCategory(cp);
@ -221,6 +263,7 @@ public class DerivedProperty implements UCD_Types {
NO = NAME[i-QuickNFD] + "_NO";
MAYBE = NAME[i-QuickNFD] + "_MAYBE";
name = NAME[i-QuickNFD] + "_QuickCheck";
shortName = NAME[i-QuickNFD] + "_QC";
header = "# Derived Property: " + name
+ "\r\n# Generated from computing decomposibles"
+ ((i == QuickNFC || i == QuickNFKC)
@ -250,9 +293,18 @@ public class DerivedProperty implements UCD_Types {
dprops[i] = new GenDProp(i);
}
for (int i = NFC_Leading; i <= NFC_Resulting; ++i) {
dprops[i] = new NFC_Prop(i);
}
for (int i = NFD_UnsafeStart; i <= NFKC_UnsafeStart; ++i) {
dprops[i] = new NF_UnsafeStartProp(i);
}
dprops[ID_Start] = new DProp() {
{
name = "ID_Start";
shortName = "IDS";
header = "# Derived Property: " + name
+ "\r\n# Characters that can start an identifier."
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
@ -265,6 +317,7 @@ public class DerivedProperty implements UCD_Types {
dprops[ID_Continue_NO_Cf] = new DProp() {
{
name = "ID_Continue";
shortName = "IDC";
header = "# Derived Property: " + name
+ "\r\n# Characters that can continue an identifier."
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
@ -278,6 +331,7 @@ public class DerivedProperty implements UCD_Types {
dprops[Mod_ID_Start] = new DProp() {
{
name = "XID_Start";
shortName = "XIDS";
header = "# Derived Property: " + name
+ "\r\n# ID_Start modified for closure under NFKx"
+ "\r\n# Modified as described in UAX #15"
@ -292,6 +346,7 @@ public class DerivedProperty implements UCD_Types {
dprops[Mod_ID_Continue_NO_Cf] = new DProp() {
{
name = "XID_Continue";
shortName = "XIDC";
header = "# Derived Property: " + name
+ "\r\n# Mod_ID_Continue modified for closure under NFKx"
+ "\r\n# Modified as described in UAX #15"
@ -307,6 +362,7 @@ public class DerivedProperty implements UCD_Types {
dprops[PropMath] = new DProp() {
{
name = "Math";
shortName = name;
header = "# Derived Property: " + name
+ "\r\n# Generated from: Sm + Other_Math";
}
@ -321,6 +377,7 @@ public class DerivedProperty implements UCD_Types {
dprops[PropAlphabetic] = new DProp() {
{
name = "Alphabetic";
shortName = "Alpha";
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
}
@ -335,6 +392,7 @@ public class DerivedProperty implements UCD_Types {
dprops[PropLowercase] = new DProp() {
{
name = "Lowercase";
shortName = "Lower";
header = "# Derived Property: " + name
+ "\r\n# Generated from: Ll + Other_Lowercase";
}
@ -349,6 +407,7 @@ public class DerivedProperty implements UCD_Types {
dprops[PropUppercase] = new DProp() {
{
name = "Uppercase";
shortName = "Upper";
header = "# Derived Property: " + name
+ "\r\n# Generated from: Lu + Other_Uppercase";
}
@ -373,7 +432,9 @@ of characters, the first of which has a non-zero combining class.
*/
dprops[FullCompExclusion] = new DProp() {
{
name = "Comp_Ex";
name = "Full_Composition_Exclusion";
shortName = "Comp_Ex";
defaultStyle = SHORT;
header = "# Derived Property: " + name
+ ": Full Composition Exclusion"
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
@ -390,7 +451,10 @@ of characters, the first of which has a non-zero combining class.
dprops[FullCompInclusion] = new DProp() {
{
name = "Comp_In";
name = "Full_Composition_Inclusion";
shortName = "Comp_In";
defaultStyle = SHORT;
testStatus = true;
header = "# Derived Property: " + name
+ ": Full Composition Inclusion"
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
@ -408,6 +472,7 @@ of characters, the first of which has a non-zero combining class.
dprops[FC_NFKC_Closure] = new DProp() {
{
name = "FC_NFKC_Closure";
shortName = "FC_NFKC";
header = "# Derived Property: " + name
+ "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
+ "\r\n# Then if (c != b) add the mapping from a to c to the set of"
@ -427,6 +492,7 @@ of characters, the first of which has a non-zero combining class.
dprops[FC_NFC_Closure] = new DProp() {
{
name = "FC_NFC_Closure";
shortName = "FC_NFC";
header = "# Derived Property: " + name
+ "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
+ "\r\n# Then if (c != b) add the mapping from a to c to the set of"
@ -450,8 +516,9 @@ of characters, the first of which has a non-zero combining class.
dprops[DefaultIgnorable] = new DProp() {
{
name = "Default_Ignorable_Code_Point";
shortName = "DI";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - WhiteSpace";
+ "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space";
}
boolean hasProperty(int cp) {
if (ucdData.getBinaryProperty(cp, White_space)) return false;
@ -471,11 +538,12 @@ of characters, the first of which has a non-zero combining class.
*/
dprops[GraphemeExtend] = new DProp() {
{
name = "GraphemeExtend";
name = "Grapheme_Extend";
shortName = "GrExt";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink"
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link"
+ "\r\n# Used in the definition of GraphemeCluster: "
+ "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*";
+ "\r\n# GraphemeCluster ::= GraphameBase? ( Grapheme_Extend | Grapheme_Link Join_Control? Grapheme_Base? )*";
}
boolean hasProperty(int cp) {
if (ucdData.getBinaryProperty(cp, GraphemeExtend)) return false;
@ -486,13 +554,80 @@ of characters, the first of which has a non-zero combining class.
}
};
// Other_Case_Ignorable: a fixed, hand-maintained list of code points.
dprops[Other_Case_Ignorable] = new DProp() {
    {
        name = "Other_Case_Ignorable";
        shortName = "OCI";
        header = "# Binary Property";
    }
    boolean hasProperty(int cp) {
        /*
        0027 ; Other_Case_Ignorable # Po APOSTROPHE
        00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN
        2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK
        */
        // 0x2d, 0x2010 and 0x2011 are deliberately not part of the list.
        return cp == 0x27 || cp == 0x2019 || cp == 0xAD;
    }
};
// Special_Dotted: i, j, and characters whose canonical decomposition is an
// i/j base followed only by combining marks that are not of class 230.
dprops[Type_i] = new DProp() {
    {
        name = "Special_Dotted";
        shortName = "SDot";
        header = "# Derived Property: " + name
            + "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that"
            + "\r\n# - starts with i or j"
            + "\r\n# - has no combining marks above"
            + "\r\n# - has no combining marks with zero canonical combining class"
            ;
    }
    boolean hasProperty(int cp) {
        if (cp == 'i' || cp == 'j') return true;
        // NOTE(review): the guard uses nfkd while the decomposition below uses nfd - confirm this is intended.
        if (!nfkd.hasDecomposition(cp)) return false;

        String decomp = nfd.normalize(cp);
        boolean sawDottedBase = false;
        // Walk the decomposition from its end, one UTF-16 unit at a time.
        int idx = decomp.length();
        while (--idx >= 0) {
            char unit = decomp.charAt(idx);
            int ccc = ucdData.getCombiningClass(unit);
            if (ccc == 230) return false;               // a mark above disqualifies
            if (ccc != 0) continue;                     // other marks are fine
            if (unit != 'i' && unit != 'j') return false; // any other cc == 0 unit disqualifies
            sawDottedBase = true;
        }
        return sawDottedBase;
    }
};
dprops[Case_Ignorable] = new DProp() {
{
name = "Case_Ignorable";
shortName = "CI";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
}
boolean hasProperty(int cp) {
byte cat = ucdData.getCategory(cp);
if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
if (dprops[Other_Case_Ignorable].hasProperty(cp)) return true;
return false;
}
};
dprops[GraphemeBase] = new DProp() {
{
name = "GraphemeBase";
name = "Grapheme_Base";
shortName = "GrBase";
header = header = "# Derived Property: " + name
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - GraphemeLink - GraphemeExtend"
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Link - Grapheme_Extend"
+ "\r\n# Used in the definition of GraphemeCluster: "
+ "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*";
+ "\r\n# GraphemeCluster ::= GraphameBase? ( Grapheme_Extend | Grapheme_Link Join_Control? Grapheme_Base? )*";
}
boolean hasProperty(int cp) {
byte cat = ucdData.getCategory(cp);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -25,7 +25,7 @@ class DiffPropertyLister extends PropertyLister {
}
public String propertyName(int cp) {
return ucdData.getVersion();
return major_minor_only(ucdData.getVersion());
}
/*
@ -49,9 +49,10 @@ class DiffPropertyLister extends PropertyLister {
public String headerString() {
if (oldUCD != null) {
return "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion();
return "# Differences between " + major_minor_only(ucdData.getVersion())
+ " and " + major_minor_only(oldUCD.getVersion());
} else {
return "# Allocated as of " + ucdData.getVersion();
return "# Designated as of " + major_minor_only(ucdData.getVersion());
}
}
@ -80,6 +81,10 @@ class DiffPropertyLister extends PropertyLister {
return count;
}
*/
/**
 * Reduces a dotted version string to its "major.minor" prefix,
 * e.g. "3.1.0" becomes "3.1".
 *
 * A string with fewer than two dots (e.g. "3.1" or "3") is already at most
 * major.minor and is returned unchanged; previously such input either lost
 * its minor field or raised StringIndexOutOfBoundsException.
 *
 * @param s version string with dot-separated fields
 * @return everything before the second dot, or s itself if there is none
 */
private String major_minor_only(String s) {
    int firstDot = s.indexOf('.');
    if (firstDot < 0) return s;
    int secondDot = s.indexOf('.', firstDot + 1);
    return secondDot < 0 ? s : s.substring(0, secondDot);
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.6 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -19,19 +19,25 @@ import java.text.DateFormat;
import java.text.SimpleDateFormat;
import com.ibm.text.utility.*;
import com.ibm.text.UTF16;
public class GenerateData implements UCD_Types {
static UnifiedBinaryProperty ubp;
public static void main (String inVersion, String[] args) throws IOException {
System.out.println("START");
ucd = UCD.make(inVersion);
ubp = new UnifiedBinaryProperty(ucd);
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
String version = ucd.getVersion();
for (int i = 0; i < args.length; ++i) {
String arg = args[i];
if (arg.charAt(0) == '#') return; // skip rest of line
int mask = 0;
long mask = 0;
Utility.fixDot();
System.out.println("Argument: " + args[i]);
@ -39,7 +45,16 @@ public class GenerateData implements UCD_Types {
if (arg.equalsIgnoreCase("partition")) {
partitionProperties();
} else if (arg.equalsIgnoreCase("list")) {
listProperties();
listProperties();
} else if (arg.equalsIgnoreCase("listAccents")) {
listCombiningAccents();
} else if (arg.equalsIgnoreCase("listGreekVowels")) {
listGreekVowels();
} else if (arg.equalsIgnoreCase("listKatakana")) {
listKatakana();
} else if (arg.equalsIgnoreCase("diff")) {
listDifferences();
} else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
@ -91,6 +106,18 @@ public class GenerateData implements UCD_Types {
mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.FC_NFC_Closure-1);
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version );
} else if (arg.equalsIgnoreCase("caseignorable")) {
mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i);
generateDerived(mask, HEADER_DERIVED, "CaseIgnorable-" + version );
} else if (arg.equalsIgnoreCase("nfcprops")) {
mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
generateDerived(mask, HEADER_DERIVED, "NFKC_SafeStart-" + version);
} else if (arg.equalsIgnoreCase("nfunsafestart")) {
mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart);
generateDerived(mask, HEADER_DERIVED, "NFUnsafeStart-" + version);
} else if (arg.equalsIgnoreCase("DerivedAge")) {
generateAge("DerivedAge-" + version );
@ -202,11 +229,11 @@ public class GenerateData implements UCD_Types {
output.println();
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
public static void generateDerived (long bitMask, int headerChoice, String fileName) throws IOException {
PrintWriter output = Utility.openPrintWriter(fileName + "dX.txt");
doHeader(fileName, output, headerChoice);
for (int i = 0; i < DerivedProperty.LIMIT; ++i) {
if ((bitMask & (1<<i)) == 0) continue;
if ((bitMask & (1L<<i)) == 0) continue;
System.out.print('.');
output.println("# ================================================");
output.println();
@ -277,7 +304,7 @@ public class GenerateData implements UCD_Types {
for (int i = 1; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
int iType = i & 0xFF00;
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
if (!ubp.isDefined(i)) continue;
props[count++] = i;
}
System.out.println("props: " + count);
@ -292,7 +319,7 @@ public class GenerateData implements UCD_Types {
if (!ucd.isAllocated(cp)) continue;
for (int i = 0; i < count; ++i) {
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, props[i]);
boolean iProp = ubp.get(cp, props[i]);
if (iProp) probe.set(i); else probe.clear(i);
}
@ -315,9 +342,9 @@ public class GenerateData implements UCD_Types {
for (int i = 1; i < LIMIT_ENUM; ++i) {
int iType = i & 0xFF00;
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS || iType == SCRIPT) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
String iNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
String iNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
if (!ubp.isDefined(i)) continue;
String iNameShort = ubp.getFullID(i, SHORT);
String iNameLong = ubp.getFullID(i, LONG);
System.out.println();
System.out.println();
@ -329,7 +356,7 @@ public class GenerateData implements UCD_Types {
int jType = j & 0xFF00;
if (jType == JOINING_GROUP || jType == AGE || jType == COMBINING_CLASS || jType == SCRIPT
|| (jType == iType && jType != BINARY_PROPERTIES)) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, j)) continue;
if (!ubp.isDefined(j)) continue;
if ((j >> 8) != last) {
last = j >> 8;
@ -349,8 +376,8 @@ public class GenerateData implements UCD_Types {
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
if (!ucd.isAllocated(cp)) continue;
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i);
boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j);
boolean iProp = ubp.get(cp, i);
boolean jProp = ubp.get(cp, j);
if (jProp) ++jCount;
if (iProp) {
@ -361,8 +388,8 @@ public class GenerateData implements UCD_Types {
}
if (iCount == 0 || jCount == 0) continue;
String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT);
//String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG);
String jNameShort = ubp.getFullID(j, SHORT);
//String jNameLong = ubp.getFullID(j, LONG);
String rel = bothCount == 0 ? "DISJOINT"
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
@ -384,26 +411,69 @@ public class GenerateData implements UCD_Types {
}
public static void listProperties() {
public static void listProperties() throws IOException {
String propAbb = "";
String prop = "";
Map duplicates = new TreeMap();
Set sorted = new TreeSet(java.text.Collator.getInstance());
String spacing;
for(int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) {
propAbb = UCD_Names.NON_ENUMERATED[k][0];
prop = UCD_Names.NON_ENUMERATED[k][1];
spacing = Utility.repeat(" ", 10-propAbb.length());
sorted.add("AA; " + propAbb + spacing + "; " + prop);
checkDuplicate(duplicates, propAbb, prop);
if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop);
}
sorted.add("xx; T ; True");
checkDuplicate(duplicates, "T", "xx");
sorted.add("xx; F ; False");
checkDuplicate(duplicates, "F", "xx");
sorted.add("qc; Y ; Yes");
checkDuplicate(duplicates, "Y", "qc");
sorted.add("qc; N ; No");
checkDuplicate(duplicates, "Y", "qc");
sorted.add("qc; M ; Maybe");
checkDuplicate(duplicates, "Y", "qc");
for (int i = 0; i < LIMIT_ENUM; ++i) {
int type = i & 0xFF00;
if (type == JOINING_GROUP || type == AGE) continue;
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
String value = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
if (type == AGE) continue;
if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
if (type == i && type != BINARY_PROPERTIES && type != DERIVED) {
propAbb = ubp.getPropertyName(i, SHORT);
prop = ubp.getPropertyName(i, LONG);
spacing = Utility.repeat(" ", 10-propAbb.length());
sorted.add("BB; " + propAbb + spacing + "; " + prop);
checkDuplicate(duplicates, propAbb, prop);
if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop);
}
if (!ubp.isDefined(i)) continue;
if (ubp.isTest(i)) continue;
String value = ubp.getID(i, LONG);
if (value.length() == 0) value = "none";
else if (value.equals("<unused>")) continue;
String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
value = fixGaps(value);
if (type == SCRIPT) {
value = ucd.getCase(value, FULL, TITLE);
}
String abbvalue = ubp.getID(i, SHORT);
if (abbvalue.length() == 0) abbvalue = "no";
if (type == COMBINING_CLASS) {
value = MyPropertyLister.getCombiningName(i);
if (value.length() == 0) {
if ((i & 0xFF) == 0) value = "99";
else continue;
}
abbvalue = value;
if (value.startsWith("Fixed_")) { continue; }
}
/*
String elide = "";
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
+ abbvalue
@ -421,9 +491,78 @@ public class GenerateData implements UCD_Types {
+ value
+ "}";
System.out.println("<tr><td>" + elide + "</td><td>" + abb + "</td><td>" + norm + "</td></tr>");
*/
spacing = Utility.repeat(" ", 10-abbvalue.length());
if (type == BINARY_PROPERTIES || type == DERIVED) {
sorted.add("ZZ; " + abbvalue + spacing + "; " + value);
checkDuplicate(duplicates, value, value);
if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, value);
continue;
}
sorted.add(propAbb + "; " + abbvalue + spacing + "; " + value);
checkDuplicate(duplicates, value, prop + "=" + value);
if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, prop + "=" + value);
}
PrintWriter log = Utility.openPrintWriter("PropertyAliases.txt");
Utility.appendFile("PropertyAliasHeader.txt", log);
Utility.print(log, sorted, "\r\n", new MyBreaker());
log.close();
}
/**
 * Breaker handed to Utility.print when writing the sorted alias lines:
 * it returns an extra line break whenever the two-character prefix of the
 * current item differs from that of the previous item.
 */
static class MyBreaker implements Utility.Breaker {
    /**
     * @param current the item about to be printed
     * @param old the previously printed item, or null for the first item
     * @return "\r\n" when both items have at least two characters and their
     *         first two characters differ; otherwise the empty string
     */
    public String get(Object current, Object old) {
        if (old == null) return "";
        String c = current.toString();
        String o = old.toString();
        // Both strings must be long enough to have a two-character prefix.
        // (The old test "o.length() >= 0" was always true, so a short
        // previous item made substring(0,2) throw.)
        if (c.length() >= 2 && o.length() >= 2 && !c.regionMatches(0, o, 0, 2)) {
            return "\r\n";
        }
        return "";
    }
}
/**
 * Records a name in the duplicates map and reports a collision on
 * System.out if an equivalent name was recorded earlier.
 *
 * Names are compared by their skeleton(), so spellings that differ only in
 * case or in non-letter characters count as the same name. The lookup and
 * the insertion now use the same skeleton key; previously the lookup used
 * the raw string while the insertion used the skeleton, so most collisions
 * went unnoticed.
 *
 * @param m map from skeleton key to the comment of the first name seen
 * @param toCheck the name or alias to register
 * @param comment description printed if a later name collides with this one
 */
static void checkDuplicate(Map m, String toCheck, String comment) {
    String key = skeleton(toCheck);
    String result = (String) m.get(key);
    if (result != null) {
        System.out.println("Collision with " + toCheck);
        System.out.println(" Between " + comment);
        System.out.println(" And " + result);
    } else {
        m.put(key, comment);
    }
}
/**
 * Normalizes the word separators of a name: spaces and hyphens become
 * underscores, and an underscore is inserted wherever a character of
 * category Ll is directly followed by one of category Lu.
 */
static String fixGaps(String source) {
    StringBuffer out = new StringBuffer();
    byte previousCategory = -1;
    int length = source.length();
    for (int pos = 0; pos < length; ++pos) {
        char raw = source.charAt(pos);
        char ch = (raw == ' ' || raw == '-') ? '_' : raw;
        byte category = ucd.getCategory(ch);
        // lowercase immediately followed by uppercase marks a word boundary
        boolean camelBoundary = previousCategory == Ll && category == Lu;
        if (camelBoundary) out.append('_');
        out.append(ch);
        previousCategory = category;
    }
    return out.toString();
}
/**
 * Builds a loose-matching key for a property name or alias: the name is
 * lowercased and everything except the letters a-z is dropped, so that
 * "ID_Start", "id-start" and "IdStart" all map to "idstart".
 *
 * Lowercasing uses a fixed locale so the key does not depend on the JVM's
 * default locale (under a Turkish default, 'I' would otherwise lowercase
 * to a dotless i and be dropped from the key).
 *
 * @param source the name to reduce
 * @return the lowercase ASCII-letter skeleton of source; may be empty
 */
static String skeleton(String source) {
    StringBuffer result = new StringBuffer();
    source = source.toLowerCase(java.util.Locale.ENGLISH);
    for (int i = 0; i < source.length(); ++i) {
        char c = source.charAt(i);
        if (c < 'a' || c > 'z') continue; // keep ASCII letters only
        result.append(c);
    }
    return result.toString();
}
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
@ -445,7 +584,7 @@ public class GenerateData implements UCD_Types {
doHeader(file, output, headerChoice);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
if (!ubp.isDefined(i)) continue;
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|| i == (BINARY_PROPERTIES | Non_break)
|| i == (BINARY_PROPERTIES | CaseFoldTurkishI)
@ -689,14 +828,19 @@ public class GenerateData implements UCD_Types {
static final void generateAge(String filename) throws IOException {
PrintWriter log = Utility.openPrintWriter(filename + "dX.txt");
try {
log.println("# Derived file showing when various code points were allocated in Unicode");
log.println("# Derived file showing when various code points were designated in Unicode");
log.println("# author: M. Davis");
log.println("# generated: " + new Date());
log.println("# Notes:");
log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 1.1.0 listing.");
log.println("# - The term 'designated' means that a previously reserved code point was specified");
log.println("# to be a noncharacter or surrogate, or assigned as a character,");
log.println("# control or format code.");
log.println("# - Versions are only tracked from 1.1 onwards, since version 1.0");
log.println("# predated changes required by the ISO 10646 merger.");
log.println("# - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing.");
log.println("# - The supplementary private use code points and the non-character code points");
log.println("# were allocated in version 2.0, but not specifically listed in the UCD");
log.println("# until versions 3.0.1 and 3.1.0 respectively.");
log.println("# were designated in version 2.0, but not specifically listed in the UCD");
log.println("# until versions 3.0 and 3.1 respectively.");
log.println("# ================================================");
log.println();
@ -713,6 +857,9 @@ public class GenerateData implements UCD_Types {
log.println("# ================================================");
log.println();
new DiffPropertyLister("3.0.0", "3.1.0", log).print();
log.println("# ================================================");
log.println();
new DiffPropertyLister("3.1.0", "3.2.0", log).print();
/*
printDiff("110", "200");
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
@ -761,5 +908,133 @@ public class GenerateData implements UCD_Types {
}
/**
 * Writes ListAccents.txt: a sorted, de-duplicated list of rule stubs.
 * For every represented code point:
 *  - if it has no NFD decomposition and its script is LATIN_SCRIPT, a
 *    "# yyy" line is emitted for the code point itself;
 *  - otherwise a "# xxx" line is emitted for each Mn (non-spacing mark)
 *    code point found in its NFD decomposition.
 *
 * @throws IOException if the output file cannot be opened
 */
public static void listCombiningAccents() throws IOException {
    PrintWriter log = Utility.openPrintWriter("ListAccents.txt");
    try {
        Normalizer nfd = new Normalizer(Normalizer.NFD);
        Set set = new TreeSet();

        // Inclusive upper bound so that U+10FFFF is examined as well.
        for (int i = 0; i <= 0x10FFFF; ++i) {
            Utility.dot(i);
            if (!ucd.isRepresented(i)) continue;

            if (!nfd.hasDecomposition(i)) {
                if (ucd.getScript(i) == LATIN_SCRIPT) {
                    String hex = "u" + Utility.hex(i, 4);
                    set.add("# yyy $x <> \\" + hex + " ; # " + ucd.getName(i));
                }
                continue;
            }

            String decomp = nfd.normalize(i);
            for (int j = 0; j < decomp.length(); ) {
                int cp = UTF16.charAt(decomp, j);
                // Advance by the width of the code point just read; the old
                // code stepped by the width of i, the character being decomposed.
                j += UTF16.getCharCount(cp);
                if (ucd.getCategory(cp) != Mn) continue;
                String hex = "u" + Utility.hex(cp, 4);
                set.add("# xxx $x <> \\" + hex + " ; # " + ucd.getName(cp));
            }
        }

        Iterator it = set.iterator();
        while (it.hasNext()) {
            log.println(it.next());
        }
    } finally {
        // Release the file even if scanning fails part-way.
        log.close();
    }
}
/**
 * Writes ListGreekVowels.txt. Scans the BMP for Greek-script characters
 * whose NFD form starts with a vowel (skipping those carrying a breve or
 * macron and those that do not round-trip through NFC), groups them by a
 * key built from the base vowel(s), and prints each group on one line with
 * members separated by ", ". Characters whose base can end a diphthong are
 * additionally listed after each possible diphthong-start vowel.
 */
public static void listGreekVowels() throws IOException {
    PrintWriter log = Utility.openPrintWriter("ListGreekVowels.txt");
    Normalizer nfd = new Normalizer(Normalizer.NFD);
    Normalizer nfc = new Normalizer(Normalizer.NFC);
    Set entries = new TreeSet();

    final String vowels = "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9";
    final String diphthongEnd = "\u03B9\u03C5\u0399\u03A5";
    final String diphthongStart = "\u03B1\u03B5\u03B7\u03BF\u03C5\u0391\u0395\u0397\u039F\u03A5";
    final String etas = "\u03B7\u0397";
    final String iotas = "\u03B9\u0399";

    for (char i = 0; i < 0xFFFF; ++i) {
        Utility.dot(i);
        if (!ucd.isRepresented(i)) continue;
        if (ucd.getScript(i) != GREEK_SCRIPT) continue;

        String decomp = nfd.normalize(i);
        // skip breve and macron
        if (decomp.indexOf('\u0306') >= 0 || decomp.indexOf('\u0304') >= 0) continue;

        String self = String.valueOf(i);
        // skip compats: the character must be the NFC form of its own decomposition
        if (!nfc.normalize(decomp).equals(self)) continue;

        char first = decomp.charAt(0);
        if (vowels.indexOf(first) < 0) continue;

        // Characters whose decomposition contains 0314 get a key prefix that sorts them last.
        String h = decomp.indexOf('\u0314') >= 0 ? "\uFFFF" : "";

        if (diphthongEnd.indexOf(first) >= 0) {
            for (int j = 0; j < diphthongStart.length(); ++j) {
                char vc = diphthongStart.charAt(j);
                String v = String.valueOf(vc);
                // no lowercase start before an uppercase end
                if (ucd.getCategory(vc) == Ll && ucd.getCategory(first) == Lu) continue;
                // eta followed by iota is not listed
                if (etas.indexOf(vc) >= 0 && iotas.indexOf(first) >= 0) continue;
                entries.add(new Pair(h + v + first, new Pair(v + decomp, v + self)));
            }
        }
        entries.add(new Pair(h + first, new Pair(decomp, self)));
    }

    Object last = "";
    for (Iterator it = entries.iterator(); it.hasNext(); ) {
        Pair entry = (Pair) it.next();
        if (last.equals(entry.first)) {
            log.print(", ");
        } else {
            // new key: start a new output line
            log.println();
            last = entry.first;
        }
        Pair forms = (Pair) entry.second;
        log.print(forms.second);
    }
    log.close();
}
/**
 * Prints a romaji syllable table to System.out: one doKana block per
 * letter a-z, with the digraphs ch, sh, dz and dj inserted right after
 * c, s and d respectively, followed by a final line break.
 */
public static void listKatakana() throws IOException {
    for (char letter = 'a'; letter <= 'z'; ++letter) {
        doKana(String.valueOf(letter));
        switch (letter) {
            case 'c':
                doKana("ch");
                break;
            case 's':
                doKana("sh");
                break;
            case 'd':
                doKana("dz");
                doKana("dj");
                break;
            default:
                break;
        }
    }
    System.out.println();
}
/**
 * Prints the syllable block for one consonant prefix to System.out:
 * a blank line, then "<i> <i><i>", then the prefix combined with each of
 * the vowels a e i o u, then the same with "y" inserted before the vowel.
 * No line break is written after the last row.
 */
public static void doKana(String i) {
    final char[] vowelSet = {'a', 'e', 'i', 'o', 'u'};

    // Assemble both rows first; the text written is the same as printing piecewise.
    StringBuffer plainRow = new StringBuffer();
    StringBuffer yRow = new StringBuffer();
    for (int k = 0; k < vowelSet.length; ++k) {
        plainRow.append(" " + i + vowelSet[k]);
        yRow.append(" " + i + "y" + vowelSet[k]);
    }

    System.out.println();
    System.out.print(i + " " + i + i);
    System.out.println();
    System.out.print(plainRow.toString());
    System.out.println();
    System.out.print(yRow.toString());
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -37,6 +37,7 @@ public final class Main {
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
@ -49,8 +50,9 @@ public final class Main {
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
else if (arg.equalsIgnoreCase("Generate")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
else {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -21,64 +21,36 @@ final class MyPropertyLister extends PropertyLister {
// NOTE(review): BRIDGE is not read anywhere in the visible part of this class;
// its purpose cannot be determined from here.
static final boolean BRIDGE = false;
// Unified property number this lister reports on: the 0xFF00 bits select the
// property group and the low byte the value (see headerString()).
private int propMask;
// Lookup helper for unified binary properties, created in the constructor.
UnifiedBinaryProperty ubp;
public MyPropertyLister(UCD ucd, int propMask, PrintWriter output) {
this.propMask = propMask;
this.output = output;
this.ucdData = ucd;
ubp = new UnifiedBinaryProperty(ucd);
if (propMask < COMBINING_CLASS) usePropertyComment = false; // skip gen cat
}
/**
 * Returns the descriptive name of a canonical combining class.
 *
 * Only the low byte of the argument is examined, so a full unified property
 * number may be passed directly.
 *
 * @param propMask combining class value, or a unified property number whose
 *                 low byte is the combining class
 * @return the name of the class, or the empty string when the class has no
 *         dedicated name
 */
static String getCombiningName (int propMask) {
    final int combiningClass = propMask & 0xFF;
    switch (combiningClass) {
        case 0:   return "NotReordered";
        case 1:   return "Overlay";
        case 7:   return "Nukta";
        case 8:   return "KanaVoicing";
        case 9:   return "Virama";
        case 202: return "AttachedBelowLeft";
        case 204: return "AttachedBelow";
        case 206: return "AttachedBelowRight";
        case 208: return "AttachedLeft";
        case 210: return "AttachedRight";
        case 212: return "AttachedAboveLeft";
        case 214: return "AttachedAbove";
        case 216: return "AttachedAboveRight";
        case 218: return "BelowLeft";
        case 220: return "Below";
        case 222: return "BelowRight";
        case 224: return "Left";
        case 226: return "Right";
        case 228: return "AboveLeft";
        case 230: return "Above";
        case 232: return "AboveRight";
        case 233: return "DoubleBelow";
        case 234: return "DoubleAbove";
        case 240: return "IotaSubscript";
        default:  return "";
    }
}
/**
 * Builds the header line for this lister's property.
 *
 * The 0xFF00 bits of propMask select the property group. Combining classes
 * get "# " plus the class name (or "Other Combining Class" when the class has
 * no dedicated name); binary properties and joining groups get an empty
 * header; every other group gets "# " plus the short ID, followed by the long
 * ID in parentheses when the two differ.
 *
 * NOTE(review): this body carries two variants of several statements (two
 * declarations of s, and of shortID/longID) and will not compile as written.
 * It looks like pre- and post-change lines of the same edit were both kept;
 * confirm which set is intended and delete the other.
 */
public String headerString() {
int main = (propMask & 0xFF00);
if (main == COMBINING_CLASS) {
// Variant 1: name lookup via getCombiningName (empty string means "no name").
String s = getCombiningName(propMask);
if (s.length() == 0) s = "Other Combining Class";
// Variant 2: name lookup via UCD (a "Fixed..." result means "no name").
String s = UCD.getCombiningID_fromIndex((short)(propMask & 0xFF), LONG);
if (s.startsWith("Fixed")) s = "Other Combining Class";
return "# " + s;
} else if (main == BINARY_PROPERTIES) {
return "";
} else if (main == JOINING_GROUP) {
return "";
} else {
// Variant 1: static helpers of this class.
String shortID = getUnifiedBinaryPropertyID(ucdData, propMask, SHORT);
String longID = getUnifiedBinaryPropertyID(ucdData, propMask, LONG);
// Variant 2: the UnifiedBinaryProperty helper held in ubp.
String shortID = ubp.getID(propMask, SHORT);
String longID = ubp.getID(propMask, LONG);
return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
}
}
/**
 * Returns the ID of the property this lister reports on; the code point
 * argument is not consulted.
 *
 * NOTE(review): two return statements follow each other, so the second is
 * unreachable and the method will not compile as written. They look like the
 * pre- and post-change variants of one line; confirm which is intended.
 */
public String propertyName(int cp) {
return getUnifiedBinaryPropertyID(propMask);
return ubp.getID(propMask);
}
public String optionalComment(int cp) {
@ -115,7 +87,7 @@ final class MyPropertyLister extends PropertyLister {
else return EXCLUDE;
}
boolean inSet = getUnifiedBinaryProperty(cp, propMask);
boolean inSet = ubp.get(cp, propMask);
/*
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3;
@ -133,151 +105,6 @@ final class MyPropertyLister extends PropertyLister {
return INCLUDE;
}
/**
* @return unified property number
*/
public static boolean isUnifiedBinaryPropertyDefined(UCD ucd, int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
switch (enum) {
case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
case COMBINING_CLASS>>8: return ucd.isCombiningClassUsed((byte)propMask);
case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
case AGE>>8: return propMask < LIMIT_AGE;
default: return false;
}
}
/**
 * Tests a code point against a unified property number, using this lister's
 * own character database.
 *
 * @param cp       the code point to test
 * @param propMask the unified property number
 * @return the result of the static three-argument overload
 */
public boolean getUnifiedBinaryProperty(int cp, int propMask) {
    boolean inSet = getUnifiedBinaryProperty(ucdData, cp, propMask);
    return inSet;
}
static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) {
int enum = propMask >> 8;
propMask &= 0xFF;
switch (enum) {
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
return ucd.getCategory(cp) == propMask;
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
return ucd.getCombiningClass(cp) == propMask;
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
return ucd.getBidiClass(cp) == propMask;
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
return ucd.getDecompositionType(cp) == propMask;
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
return ucd.getNumericType(cp) == propMask;
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
return ucd.getEastAsianWidth(cp) == propMask;
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
return ucd.getLineBreak(cp) == propMask;
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
return ucd.getJoiningType(cp) == propMask;
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroup(cp) == propMask;
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
return ucd.getBinaryProperty(cp, propMask);
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
return ucd.getScript(cp) == propMask;
case AGE>>8: if (propMask >= LIMIT_AGE) break;
return ucd.getAge(cp) == propMask;
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
// Name-style selectors for the ID methods of this class: values below LONG
// request the short form, LONG the long form; getFullUnifiedBinaryPropertyID
// renders any other value as "short(long)".
static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
/**
 * Returns the NORMAL-style ID of a unified property number, using this
 * lister's own character database.
 *
 * @param unifiedPropMask the unified property number
 * @return the ID produced by the static three-argument overload
 */
public String getUnifiedBinaryPropertyID(int unifiedPropMask) {
    String id = getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL);
    return id;
}
/**
 * Returns a combined ID for a unified property number.
 *
 * @param ucd             the character database to query
 * @param unifiedPropMask the unified property number
 * @return the long ID when it equals the short ID, otherwise the short ID
 *         followed by the long ID in parentheses
 */
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask) {
    final String longName = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
    final String shortName = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
    return longName.equals(shortName)
        ? longName
        : shortName + "(" + longName + ")";
}
/**
 * Returns a "property=value" label for a unified property number.
 *
 * For every group except binary properties the label is the group name, "=",
 * and the value name. Binary properties have no group name, so their label is
 * the property name followed by "=T". An empty short value name is rendered
 * as "xx" and an empty long value name as "none".
 *
 * @param ucd             the character database to query
 * @param unifiedPropMask the unified property number
 * @param style           below LONG for short names, LONG for long names;
 *                        any other value yields "short(long)" wherever the
 *                        two forms differ
 * @return the assembled label
 */
public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
    final boolean wantShort = style < LONG;
    final boolean wantLong = style == LONG;

    // Group-name part; stays empty for binary properties.
    String prefix = "";
    boolean isBinary = (unifiedPropMask & 0xFF00) == BINARY_PROPERTIES;
    if (!isBinary) {
        String abbreviated = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
        String spelledOut = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
        if (wantShort) {
            prefix = abbreviated;
        } else if (wantLong || abbreviated.equals(spelledOut)) {
            prefix = spelledOut;
        } else {
            prefix = abbreviated + "(" + spelledOut + ")";
        }
    }

    // Value-name part, with placeholders for empty names.
    String shortValue = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
    if (shortValue.length() == 0) {
        shortValue = "xx";
    }
    String longValue = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
    if (longValue.length() == 0) {
        longValue = "none";
    }

    String suffix;
    if (wantShort) {
        suffix = shortValue;
    } else if (wantLong || shortValue.equals(longValue)) {
        suffix = longValue;
    } else {
        suffix = shortValue + "(" + longValue + ")";
    }

    // Without a group name the value name itself becomes the left-hand side.
    if (prefix.length() == 0) {
        return suffix + "=" + "T";
    }
    return prefix + suffix;
}
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
int enum = unifiedPropMask >> 8;
byte propMask = (byte)unifiedPropMask;
switch (enum) {
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
return UCD_Names.LONG_GC[propMask];
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
String s = "";
if (style == LONG) {
s = getCombiningName(unifiedPropMask);
if (s.length() != 0) return s;
s = "fixed_";
}
return s + ucd.getCombiningClassID_fromIndex((short)(0xFF & propMask));
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
return UCD_Names.LONG_BC[propMask];
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
return UCD_Names.SHORT_DT[propMask];
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
return UCD_Names.SHORT_NT[propMask];
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
return UCD_Names.SHORT_EA[propMask];
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
return UCD_Names.LONG_LB[propMask];
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
return UCD_Names.LONG_JOINING_TYPE[propMask];
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
return ucd.getJoiningGroupID_fromIndex(propMask);
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
return UCD_Names.SHORT_BP[propMask];
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
return UCD_Names.ABB_SCRIPT[propMask];
case AGE>>8: if (propMask >= LIMIT_AGE) break;
return ucd.getAgeID_fromIndex(propMask);
}
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
}
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
* $Date: 2001/09/06 01:29:48 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -208,12 +208,27 @@ public final class Normalizer implements UCD_Types {
* pair is firstChar << 16 | secondChar.
* Will need to be fixed for surrogates.
*/
/*
public IntHashtable.IntEnumeration getComposition() {
return data.getComposition();
}
*/
/**
 * Marks the code points that take part in composition.
 *
 * Each entry of the composition table has a key that packs the first code
 * point of a pair into the upper 32 bits and the second into the lower 32
 * bits, and a value holding the composed code point. In addition to the table
 * entries, the Jamo between LBase and TLimit and the Hangul syllables that
 * isDoubleHangul accepts are added to the matching sets.
 *
 * @param leading   receives code points that can start a composition; may be null
 * @param trailing  receives code points that can end a composition; may be null
 * @param resulting receives code points produced by a composition; may be null
 */
public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
    final boolean wantLeading = leading != null;
    final boolean wantTrailing = trailing != null;
    final boolean wantResulting = resulting != null;

    // Table-driven compositions.
    for (Iterator keys = data.compTable.keySet().iterator(); keys.hasNext(); ) {
        Long packedPair = (Long) keys.next();
        Integer composite = (Integer) data.compTable.get(packedPair);
        long bits = packedPair.longValue();
        if (wantLeading) {
            leading.set((int)(bits >>> 32));
        }
        if (wantTrailing) {
            trailing.set((int)bits);
        }
        if (wantResulting) {
            resulting.set(composite.intValue());
        }
    }

    // Hangul Jamo that form syllables: initial ones lead, final ones trail.
    for (int jamo = UCD.LBase; jamo < UCD.TLimit; ++jamo) {
        if (wantLeading) {
            if (UCD.isLeadingJamo(jamo)) leading.set(jamo);
        }
        if (wantTrailing) {
            if (UCD.isTrailingJamo(jamo)) trailing.set(jamo);
        }
    }

    // Two-Jamo syllables also count as leading.
    if (!wantLeading) {
        return;
    }
    for (int syllable = UCD.SBase; syllable < UCD.SLimit; ++syllable) {
        if (UCD.isDoubleHangul(syllable)) {
            leading.set(syllable);
        }
    }
}
public boolean isTrailing(int cp) {
return this.composition ? data.isTrailing(cp) : false;

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.4 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
@ -30,7 +30,7 @@ public class TestData implements UCD_Types {
checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
int mask = 0;
long mask = 0;
if (false) {
@ -166,7 +166,7 @@ public class TestData implements UCD_Types {
output.println();
}
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
public static void generateDerived (long bitMask, int headerChoice, String fileName) throws IOException {
ucd = UCD.make("3.1.0");
PrintWriter output = Utility.openPrintWriter(fileName);
doHeader(fileName, output, headerChoice);
@ -251,9 +251,11 @@ public class TestData implements UCD_Types {
PrintWriter output = Utility.openPrintWriter(file);
doHeader(file, output, headerChoice);
UnifiedBinaryProperty ubp = new UnifiedBinaryProperty(ucd);
int last = -1;
for (int i = startEnum; i < endEnum; ++i) {
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
if (!ubp.isDefined(i)) continue;
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|| i == (CATEGORY | UNUSED_CATEGORY)
|| i == (BINARY_PROPERTIES | Non_break)

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -296,24 +296,38 @@ public final class UCD implements UCD_Types {
}
throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull);
}
// U+00AD and U+2019: together with '-' and '\'' these are skipped by the
// titlecasing loop in getCase when it decides the case of the next character.
static final char SHY = '\u00AD';
static final char APOSTROPHE = '\u2019';
public String getCase(String s, byte simpleVsFull, byte caseType, String condition) {
if (UTF32.length32(s) == 1) return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType);
StringBuffer result = new StringBuffer();
int cp;
byte currentCaseType = caseType;
DerivedProperty dp = new DerivedProperty(this);
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
cp = UTF32.char32At(s, i);
String mappedVersion = getCase(cp, simpleVsFull, currentCaseType, condition);
result.append(mappedVersion);
if (caseType == TITLE) {
// if letter is cased, change to lowercase, otherwise change to TITLE
if (caseType == TITLE) { // set the case type for the next character
// certain characters are ignored
if (cp == '-' || cp == SHY || cp == '\'' || cp == APOSTROPHE) continue;
byte cat = getCategory(cp);
if (cat == Mn || cat == Me || cat == Mc) {
// ignore!
} else if (cat == Lu || cat == Ll || cat == Lt
|| getBinaryProperty(cp, Other_Lowercase)
|| getBinaryProperty(cp, Other_Uppercase)) {
if (cat == Mn || cat == Me || cat == Cf || cat == Lm) continue;
if (dp.hasProperty(cp, DerivedProperty.DefaultIgnorable)) continue;
// if DefaultIgnorable is not supported, then
// check for (Cf + Cc + Cs) - White_Space
// if (cat == Cs && cp != 0x85 && (cp < 9 || cp > 0xD)) continue;
// if letter is cased, change next to lowercase, otherwise revert to TITLE
if (cat == Lu || cat == Ll || cat == Lt
|| getBinaryProperty(cp, Other_Lowercase) // skip if not supported
|| getBinaryProperty(cp, Other_Uppercase) // skip if not supported
) {
currentCaseType = LOWER;
} else {
currentCaseType = TITLE;
@ -528,6 +542,43 @@ public final class UCD implements UCD_Types {
/**
 * Looks up the general-category ID stored at the given index of UCD_Names.GC.
 *
 * @param prop index into the general-category name table
 * @return the ID at that index
 */
public static String getCategoryID_fromIndex(byte prop) {
    final String[] categoryNames = UCD_Names.GC;
    return categoryNames[prop];
}
/**
 * Returns the combining-class ID of a code point.
 *
 * @param codePoint the code point whose combining class is looked up
 * @param style     name style, passed through to getCombiningID_fromIndex
 * @return the ID of the code point's combining class
 */
public String getCombiningID(int codePoint, byte style) {
    String id = getCombiningID_fromIndex(getCombiningClass(codePoint), style);
    return id;
}
/**
 * Returns the ID of a canonical combining class.
 *
 * Classes with a dedicated name yield an abbreviation when style is below
 * LONG and a descriptive name otherwise. Every other class yields "Fixed_"
 * followed by the class number, regardless of style.
 *
 * @param index the combining class value
 * @param style name style; values below LONG select the abbreviation
 * @return the ID of the class
 */
static String getCombiningID_fromIndex (short index, byte style) {
    String s = "Fixed";
    switch (index) {
      case 0: s = style < LONG ? "NR" : "NotReordered"; break;
      case 1: s = style < LONG ? "OV" : "Overlay"; break;
      case 7: s = style < LONG ? "NK" : "Nukta"; break;
      case 8: s = style < LONG ? "KV" : "KanaVoicing"; break;
      case 9: s = style < LONG ? "VR" : "Virama"; break;
      case 202: s = style < LONG ? "ATBL" : "AttachedBelowLeft"; break;
      case 204: s = style < LONG ? "ATB" : "AttachedBelow"; break;
      case 206: s = style < LONG ? "ATBR" : "AttachedBelowRight"; break;
      case 208: s = style < LONG ? "ATL" : "AttachedLeft"; break;
      case 210: s = style < LONG ? "ATR" : "AttachedRight"; break;
      case 212: s = style < LONG ? "ATAL" : "AttachedAboveLeft"; break;
      case 214: s = style < LONG ? "ATA" : "AttachedAbove"; break;
      case 216: s = style < LONG ? "ATAR" : "AttachedAboveRight"; break;
      case 218: s = style < LONG ? "BL" : "BelowLeft"; break;
      case 220: s = style < LONG ? "B" : "Below"; break;
      case 222: s = style < LONG ? "BR" : "BelowRight"; break;
      case 224: s = style < LONG ? "L" : "Left"; break;
      case 226: s = style < LONG ? "R" : "Right"; break;
      case 228: s = style < LONG ? "AL" : "AboveLeft"; break;
      case 230: s = style < LONG ? "A" : "Above"; break;
      case 232: s = style < LONG ? "AR" : "AboveRight"; break;
      case 233: s = style < LONG ? "DB" : "DoubleBelow"; break;
      // Was "DB", which collided with class 233 and made the two classes
      // indistinguishable in short-name output.
      case 234: s = style < LONG ? "DA" : "DoubleAbove"; break;
      case 240: s = style < LONG ? "IS" : "IotaSubscript"; break;
      default: s += "_" + index;
    }
    return s;
}
public String getBidiClassID(int codePoint) {
return getBidiClassID_fromIndex(getBidiClass(codePoint));
@ -868,7 +919,7 @@ to guarantee identifier closure.
// Hangul constants
static final int
public static final int
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
LCount = 19, VCount = 21, TCount = 28,
NCount = VCount * TCount, // 588
@ -891,6 +942,14 @@ to guarantee identifier closure.
}
private static final char[] pair = new char[2];
/**
 * Tells whether a Hangul syllable sits on a TCount boundary within the
 * syllable block, i.e. its offset from SBase is a multiple of TCount.
 *
 * @param s the syllable code point
 * @return true if (s - SBase) is a multiple of TCount
 * @throws IllegalArgumentException if s lies outside [SBase, SBase + SCount)
 */
static boolean isDoubleHangul(int s) {
    final int offset = s - SBase;
    final boolean inSyllableBlock = offset >= 0 && offset < SCount;
    if (!inSyllableBlock) {
        throw new IllegalArgumentException("Not a Hangul Syllable: " + s);
    }
    return offset % TCount == 0;
}
static String getHangulDecompositionPair(int ch) {
int SIndex = ch - SBase;
@ -923,6 +982,10 @@ to guarantee identifier closure.
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
}
/**
 * Tells whether a code point lies in the half-open range [LBase, LLimit).
 *
 * @param cp the code point to test
 * @return true if LBase <= cp < LLimit
 */
static boolean isLeadingJamo(int cp) {
    if (cp < LBase) {
        return false;
    }
    return cp < LLimit;
}
private void fillFromFile(String version) {
DataInputStream dataIn = null;
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
* $Date: 2001/09/01 00:06:15 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -17,6 +17,22 @@ import com.ibm.text.utility.*;
final class UCD_Names implements UCD_Types {
// Names of the non-enumerated properties. Each row is a pair:
// element 0 is the abbreviation, element 1 the long name.
public static String[][] NON_ENUMERATED = {
{"na", "Name"},
{"dm", "Decomposition_Mapping"},
{"nv", "Numeric_Value"},
{"bmg", "Bidi_Mirroring_Glyph"},
{"lc", "Lowercase_Mapping"},
{"uc", "Uppercase_Mapping"},
{"tc", "Titlecase_Mapping"},
{"cf", "Case_Folding"},
{"slc", "Simple_Lowercase_Mapping"},
{"suc", "Simple_Uppercase_Mapping"},
{"stc", "Simple_Titlecase_Mapping"},
{"sfc", "Simple_Case_Folding"},
{"scc", "Special_Case_Condition"}
};
static final String[] UNIFIED_PROPERTIES = {
"General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)",
@ -32,7 +48,8 @@ final class UCD_Names implements UCD_Types {
"Joining Group (listing ArabicShaping.txt, field 2)",
"BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
"Script",
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)"
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)",
"Derived"
};
static final String[] SHORT_UNIFIED_PROPERTIES = {
@ -45,9 +62,10 @@ final class UCD_Names implements UCD_Types {
"LineBreak",
"JoiningType",
"JoiningGroup",
"Value",
"",
"Script",
"Age"
"Age",
""
};
static final String[] ABB_UNIFIED_PROPERTIES = {
@ -60,15 +78,16 @@ final class UCD_Names implements UCD_Types {
"lb",
"jt",
"jg",
"va",
"",
"sc",
"Ag"
"ag",
"",
};
static final String[] BP = {
"BidiMirrored",
"CompositionExclusion",
"Bidi_Mirrored",
"Composition_Exclusion",
"White_Space",
"NonBreak",
"Bidi_Control",
@ -87,46 +106,46 @@ final class UCD_Names implements UCD_Types {
"Other_Lowercase",
"Other_Uppercase",
"Noncharacter_Code_Point",
"CaseFoldTurkishI",
"Other_GraphemeExtend",
"GraphemeLink",
"IDS_BinaryOperator",
"IDS_TrinaryOperator",
"Case_Fold_Turkish_I",
"Other_Grapheme_Extend",
"Grapheme_Link",
"IDS_Binary_Operator",
"IDS_Trinary_Operator",
"Radical",
"UnifiedIdeograph",
"Unified_Ideograph",
"Other_Default_Ignorable_Code_Point",
"Deprecated",
};
static final String[] SHORT_BP = {
"BidiM",
"CExc",
"WhSp",
"CE",
"WSpace",
"NBrk",
"BdCon",
"JCon",
"BidiC",
"JoinC",
"Dash",
"Hyph",
"Hyphen",
"QMark",
"TPunc",
"Term",
"OMath",
"HexD",
"AHexD",
"OAlph",
"Hex",
"AHex",
"OAlpha",
"Ideo",
"Diac",
"Dia",
"Ext",
"OLoc",
"OUpc",
"OLower",
"OUpper",
"NChar",
"TurkI",
"OGrX",
"OGrExt",
"GrLink",
"IDSB",
"IDST",
"Radical",
"UCJK",
"RCf",
"UIdeo",
"ODI",
"Dep",
};
@ -196,7 +215,7 @@ final class UCD_Names implements UCD_Types {
"Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen",
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
"ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace"
"ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace"
};
public static final String[] SCRIPT = {

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
* $Date: 2001/09/19 23:33:16 $
* $Revision: 1.3 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -37,6 +37,10 @@ public interface UCD_Types {
13 Lower case equivalent mapping. Similar to 12. This field is informative.
14 Title case equivalent mapping. Similar to 12. This field is informative.
*/
// for IDs
static final byte SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
// Binary ENUM Grouping
public static final int
@ -52,8 +56,9 @@ public interface UCD_Types {
BINARY_PROPERTIES = 0x900,
SCRIPT = 0xA00,
AGE = 0xB00,
DERIVED = 0xC00,
NEXT_ENUM = 0x100,
LIMIT_ENUM = AGE + 0x100;
LIMIT_ENUM = DERIVED + 0x100;
public static final int LIMIT_COMBINING_CLASS = 256;
@ -384,4 +389,65 @@ public static byte
YUDH_HE = 48,
ZAIN = 49,
LIMIT_JOINING_GROUP = 50;
// DERIVED PROPERTY
// Index numbers of the derived properties. The values are consecutive from 0,
// and LIMIT is one more than the highest index, so a new property is added by
// inserting it before LIMIT and increasing LIMIT by one.
static final int
PropMath = 0,
PropAlphabetic = 1,
PropLowercase = 2,
PropUppercase = 3,
ID_Start = 4,
ID_Continue_NO_Cf = 5,
Mod_ID_Start = 6,
Mod_ID_Continue_NO_Cf = 7,
Missing_Uppercase = 8,
Missing_Lowercase = 9,
Missing_Mixedcase = 10,
FC_NFKC_Closure = 11,
FullCompExclusion = 12,
FullCompInclusion = 13,
QuickNFD = 14,
QuickNFC = 15,
QuickNFKD = 16,
QuickNFKC = 17,
ExpandsOnNFD = 18,
ExpandsOnNFC = 19,
ExpandsOnNFKD = 20,
ExpandsOnNFKC = 21,
GenNFD = 22,
GenNFC = 23,
GenNFKD = 24,
GenNFKC = 25,
DefaultIgnorable = 26,
GraphemeExtend = 27,
GraphemeBase = 28,
FC_NFC_Closure = 29,
Other_Case_Ignorable = 30,
Case_Ignorable = 31,
Type_i = 32,
NFC_Leading = 33,
NFC_TrailingNonZero = 34,
NFC_TrailingZero = 35,
NFC_Resulting = 36,
NFD_UnsafeStart = 37,
NFC_UnsafeStart = 38,
NFKD_UnsafeStart = 39,
NFKC_UnsafeStart = 40,
LIMIT = 41;
}

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -185,7 +185,7 @@ class UData implements UCD_Types {
result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");
int lastPos = result.length();
if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\'');
if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\'');
if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\'');
@ -232,7 +232,7 @@ class UData implements UCD_Types {
result.append("/>");
return result.toString();
}
public void writeBytes(DataOutputStream os) throws IOException {
compact();
os.writeInt(codePoint);

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
* $Date: 2001/09/19 23:33:15 $
* $Revision: 1.5 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.6 $
*
*******************************************************************************
*/
@ -82,6 +82,16 @@ public class VerifyUCD implements UCD_Types {
Utility.fixDot();
System.out.println("checkCase");
ucd = UCD.make(Main.ucdVersion);
String test = "The qui'ck br\u2019own 'fox jum\u00ADped ov\u200Ber th\u200Ce lazy dog.";
String ttest = ucd.getCase(test, FULL, TITLE);
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt");
titleTest.println(test);
titleTest.println(ttest);
titleTest.close();
initNormalizers();
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
String fileName = "CaseDifferences.txt";

View file

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
* $Date: 2001/08/31 00:29:50 $
* $Revision: 1.2 $
* $Date: 2001/10/25 20:33:46 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -18,7 +18,34 @@ import java.io.*;
//import java.text.*;
import com.ibm.text.utility.*;
public class WriteJavaScriptInfo {
public class WriteJavaScriptInfo implements UCD_Types {
/**
 * Writes the ranges of assigned code points to the file "assigned.js".
 *
 * A code point counts as assigned when its general category is none of Cn,
 * Co or Cs. Every maximal run of assigned code points between 0 and 0x10FFFF
 * produces one output line of the form "start,end,". For a run consisting of
 * a single code point the end is omitted, giving "start,,".
 *
 * @throws IOException declared for opening the output file
 */
static public void assigned() throws IOException {
    PrintWriter log = Utility.openPrintWriter("assigned.js");
    // Close the writer on every path: a failure while loading the database or
    // reading a category must not leave the output file open.
    try {
        UCD ucd = UCD.make();
        boolean wasIn = false;
        int lastWritten = -100;
        int i;
        for (i = 0; i <= 0x10FFFF; ++i) {
            byte cat = ucd.getCategory(i);
            boolean in = cat != Cn && cat != Co && cat != Cs;
            if (wasIn == in) continue;
            if (in) {
                // A run starts here.
                log.print(i + ",");
                lastWritten = i;
            } else {
                // The run ended at i-1; skip the end when it equals the start.
                if (lastWritten != i-1) log.print(i-1);
                log.println(",");
            }
            wasIn = in;
        }
        // Close a run that extends to the last code point.
        if (wasIn) {
            if (lastWritten != i-1) log.print(i-1);
            log.println(",");
        }
    } finally {
        log.close();
    }
}
/* TODO: fix enumeration of compositions
static public void writeJavascriptInfo() throws IOException {