mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 13:35:32 +00:00
additional derived properties
X-SVN-Rev: 6434
This commit is contained in:
parent
73c3b12dd4
commit
1dbf16c3c3
13 changed files with 797 additions and 351 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,55 +14,13 @@
|
|||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.*;
|
||||
import java.util.*;
|
||||
|
||||
public class DerivedProperty implements UCD_Types {
|
||||
|
||||
UCD ucdData;
|
||||
|
||||
static final int
|
||||
PropMath = 0,
|
||||
PropAlphabetic = 1,
|
||||
PropLowercase = 2,
|
||||
PropUppercase = 3,
|
||||
|
||||
ID_Start = 4,
|
||||
ID_Continue_NO_Cf = 5,
|
||||
|
||||
Mod_ID_Start = 6,
|
||||
Mod_ID_Continue_NO_Cf = 7,
|
||||
|
||||
Missing_Uppercase = 8,
|
||||
Missing_Lowercase = 9,
|
||||
Missing_Mixedcase = 10,
|
||||
|
||||
FC_NFKC_Closure = 11,
|
||||
|
||||
FullCompExclusion = 12,
|
||||
FullCompInclusion = 13,
|
||||
|
||||
QuickNFD = 14,
|
||||
QuickNFC = 15,
|
||||
QuickNFKD = 16,
|
||||
QuickNFKC = 17,
|
||||
|
||||
ExpandsOnNFD = 18,
|
||||
ExpandsOnNFC = 19,
|
||||
ExpandsOnNFKD = 20,
|
||||
ExpandsOnNFKC = 21,
|
||||
|
||||
GenNFD = 22,
|
||||
GenNFC = 23,
|
||||
GenNFKD = 24,
|
||||
GenNFKC = 25,
|
||||
|
||||
DefaultIgnorable = 26,
|
||||
GraphemeExtend = 27,
|
||||
GraphemeBase = 28,
|
||||
|
||||
FC_NFC_Closure = 29,
|
||||
|
||||
LIMIT = 30;
|
||||
|
||||
// ADD CONSTANT to UCD_TYPES
|
||||
|
||||
public DerivedProperty(UCD ucd) {
|
||||
ucdData = ucd;
|
||||
|
@ -74,9 +32,9 @@ public class DerivedProperty implements UCD_Types {
|
|||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
public String getName(int propNumber) {
|
||||
public String getName(int propNumber, byte style) {
|
||||
DProp dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getName();
|
||||
if (dp != null) return dp.getName(style);
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
|
@ -87,10 +45,17 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
|
||||
public boolean isDefined(int propNumber) {
|
||||
if (propNumber < 0 || propNumber >= dprops.length) return false;
|
||||
return dprops[propNumber] != null;
|
||||
}
|
||||
|
||||
public boolean isTest(int propNumber) {
|
||||
if (!isDefined(propNumber)) return false;
|
||||
return dprops[propNumber].isTest();
|
||||
}
|
||||
|
||||
public boolean hasProperty(int cp, int propNumber) {
|
||||
if (!isDefined(propNumber)) return false;
|
||||
return dprops[propNumber].hasProperty(cp);
|
||||
}
|
||||
|
||||
|
@ -112,9 +77,15 @@ public class DerivedProperty implements UCD_Types {
|
|||
"Mixedcase"};
|
||||
|
||||
private abstract class DProp {
|
||||
String name, header;
|
||||
String getName() { return name; }
|
||||
boolean testStatus = false;
|
||||
byte defaultStyle = LONG;
|
||||
String name, shortName, header;
|
||||
String getName(byte style) {
|
||||
if (style == NORMAL) style = defaultStyle;
|
||||
return style < LONG ? shortName : name;
|
||||
}
|
||||
String getHeader() { return header; }
|
||||
boolean isTest() { return testStatus; }
|
||||
abstract boolean hasProperty(int cp);
|
||||
public boolean propertyVaries() { return false; }
|
||||
public String getProperty(int cp) { return hasProperty(cp) ? name : ""; }
|
||||
|
@ -125,6 +96,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
ExDProp(int i) {
|
||||
nfx = nf[i-ExpandsOnNFD];
|
||||
name = "Expands_On_" + NAME[i-ExpandsOnNFD];
|
||||
shortName = "XO_" + NAME[i-ExpandsOnNFD];
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters whose normalized length is not one."
|
||||
|
@ -139,11 +111,80 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
class NF_UnsafeStartProp extends DProp {
|
||||
Normalizer nfx;
|
||||
int prop;
|
||||
|
||||
NF_UnsafeStartProp(int i) {
|
||||
prop = i-NFD_UnsafeStart;
|
||||
nfx = nf[prop];
|
||||
name = NAME[prop] + "_UnsafeStart";
|
||||
shortName = NAME[prop] + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
|
||||
;
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
if (ucdData.getCombiningClass(cp) != 0) return false;
|
||||
String norm = nfx.normalize(cp);
|
||||
int first = UTF16.charAt(norm, 0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
if ((prop == 1 || prop == 3)
|
||||
&& dprops[NFC_TrailingZero].hasProperty(first)) return true; // 1,3 == composing
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class NFC_Prop extends DProp {
|
||||
BitSet bitset;
|
||||
boolean filter = false;
|
||||
boolean keepNonZero = true;
|
||||
|
||||
NFC_Prop(int i) {
|
||||
BitSet[] bitsets = new BitSet[3];
|
||||
switch(i) {
|
||||
case NFC_Leading: bitsets[0] = bitset = new BitSet(); break;
|
||||
case NFC_Resulting: bitsets[2] = bitset = new BitSet(); break;
|
||||
case NFC_TrailingZero: keepNonZero = false; // FALL THRU
|
||||
case NFC_TrailingNonZero: bitsets[1] = bitset = new BitSet(); break;
|
||||
}
|
||||
filter = bitsets[1] != null;
|
||||
nfc.getCompositionStatus(bitsets[0], bitsets[1], bitsets[2]);
|
||||
|
||||
name = Names[i-NFC_Leading];
|
||||
shortName = SNames[i-NFC_Leading];
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# " + Description[i-NFC_Leading]
|
||||
+ "\r\n# NFKC characters are the same, after subtracting the NFKD = NO values."
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
boolean result = bitset.get(cp);
|
||||
if (result && filter) {
|
||||
result = (ucdData.getCombiningClass(cp) != 0) == keepNonZero;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
final String[] Names = {"NFC_Leading", "NFC_TrailingNonZero", "NFC_TrailingZero", "NFC_Resulting"};
|
||||
final String[] SNames = {"NFC_L", "NFC_TNZ", "NFC_TZ", "NFC_R"};
|
||||
final String[] Description = {
|
||||
"Characters that can combine with following characters in NFC",
|
||||
"Characters that can combine with previous characters in NFC, and have non-zero combining class",
|
||||
"Characters that can combine with previous characters in NFC, and have zero combining class",
|
||||
"Characters that can result from a combination of other characters in NFC",
|
||||
};
|
||||
};
|
||||
|
||||
class GenDProp extends DProp {
|
||||
Normalizer nfx;
|
||||
Normalizer nfComp = null;
|
||||
|
||||
GenDProp (int i) {
|
||||
testStatus = true;
|
||||
nfx = nf[i-GenNFD];
|
||||
name = NAME[i-GenNFD];
|
||||
String compName = "the character itself";
|
||||
|
@ -201,6 +242,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: NFKD has >0 " + CaseNames[i-Missing_Uppercase] + ", no other cases";
|
||||
testStatus = true;
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
|
@ -221,6 +263,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
NO = NAME[i-QuickNFD] + "_NO";
|
||||
MAYBE = NAME[i-QuickNFD] + "_MAYBE";
|
||||
name = NAME[i-QuickNFD] + "_QuickCheck";
|
||||
shortName = NAME[i-QuickNFD] + "_QC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from computing decomposibles"
|
||||
+ ((i == QuickNFC || i == QuickNFKC)
|
||||
|
@ -250,9 +293,18 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[i] = new GenDProp(i);
|
||||
}
|
||||
|
||||
for (int i = NFC_Leading; i <= NFC_Resulting; ++i) {
|
||||
dprops[i] = new NFC_Prop(i);
|
||||
}
|
||||
|
||||
for (int i = NFD_UnsafeStart; i <= NFKC_UnsafeStart; ++i) {
|
||||
dprops[i] = new NF_UnsafeStartProp(i);
|
||||
}
|
||||
|
||||
dprops[ID_Start] = new DProp() {
|
||||
{
|
||||
name = "ID_Start";
|
||||
shortName = "IDS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Characters that can start an identifier."
|
||||
+ "\r\n# Generated from Lu+Ll+Lt+Lm+Lo+Nl";
|
||||
|
@ -265,6 +317,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[ID_Continue_NO_Cf] = new DProp() {
|
||||
{
|
||||
name = "ID_Continue";
|
||||
shortName = "IDC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Characters that can continue an identifier."
|
||||
+ "\r\n# Generated from: ID_Start + Mn+Mc+Nd+Pc"
|
||||
|
@ -278,6 +331,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[Mod_ID_Start] = new DProp() {
|
||||
{
|
||||
name = "XID_Start";
|
||||
shortName = "XIDS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# ID_Start modified for closure under NFKx"
|
||||
+ "\r\n# Modified as described in UAX #15"
|
||||
|
@ -292,6 +346,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[Mod_ID_Continue_NO_Cf] = new DProp() {
|
||||
{
|
||||
name = "XID_Continue";
|
||||
shortName = "XIDC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Mod_ID_Continue modified for closure under NFKx"
|
||||
+ "\r\n# Modified as described in UAX #15"
|
||||
|
@ -307,6 +362,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[PropMath] = new DProp() {
|
||||
{
|
||||
name = "Math";
|
||||
shortName = name;
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Sm + Other_Math";
|
||||
}
|
||||
|
@ -321,6 +377,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[PropAlphabetic] = new DProp() {
|
||||
{
|
||||
name = "Alphabetic";
|
||||
shortName = "Alpha";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Lu+Ll+Lt+Lm+Lo+Nl + Other_Alphabetic";
|
||||
}
|
||||
|
@ -335,6 +392,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[PropLowercase] = new DProp() {
|
||||
{
|
||||
name = "Lowercase";
|
||||
shortName = "Lower";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Ll + Other_Lowercase";
|
||||
}
|
||||
|
@ -349,6 +407,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
dprops[PropUppercase] = new DProp() {
|
||||
{
|
||||
name = "Uppercase";
|
||||
shortName = "Upper";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Lu + Other_Uppercase";
|
||||
}
|
||||
|
@ -373,7 +432,9 @@ of characters, the first of which has a non-zero combining class.
|
|||
*/
|
||||
dprops[FullCompExclusion] = new DProp() {
|
||||
{
|
||||
name = "Comp_Ex";
|
||||
name = "Full_Composition_Exclusion";
|
||||
shortName = "Comp_Ex";
|
||||
defaultStyle = SHORT;
|
||||
header = "# Derived Property: " + name
|
||||
+ ": Full Composition Exclusion"
|
||||
+ "\r\n# Generated from: Composition Exclusions + Singletons + Non-Starter Decompositions";
|
||||
|
@ -390,7 +451,10 @@ of characters, the first of which has a non-zero combining class.
|
|||
|
||||
dprops[FullCompInclusion] = new DProp() {
|
||||
{
|
||||
name = "Comp_In";
|
||||
name = "Full_Composition_Inclusion";
|
||||
shortName = "Comp_In";
|
||||
defaultStyle = SHORT;
|
||||
testStatus = true;
|
||||
header = "# Derived Property: " + name
|
||||
+ ": Full Composition Inclusion"
|
||||
+ "\r\n# characters with Canonical Decompositions MINUS Full Composition Exclusion";
|
||||
|
@ -408,6 +472,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
dprops[FC_NFKC_Closure] = new DProp() {
|
||||
{
|
||||
name = "FC_NFKC_Closure";
|
||||
shortName = "FC_NFKC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from computing: b = NFKC(Fold(a)); c = NFKC(Fold(b));"
|
||||
+ "\r\n# Then if (c != b) add the mapping from a to c to the set of"
|
||||
|
@ -427,6 +492,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
dprops[FC_NFC_Closure] = new DProp() {
|
||||
{
|
||||
name = "FC_NFC_Closure";
|
||||
shortName = "FC_NFC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from computing: b = NFC(Fold(a)); c = NFC(Fold(b));"
|
||||
+ "\r\n# Then if (c != b) add the mapping from a to c to the set of"
|
||||
|
@ -450,8 +516,9 @@ of characters, the first of which has a non-zero combining class.
|
|||
dprops[DefaultIgnorable] = new DProp() {
|
||||
{
|
||||
name = "Default_Ignorable_Code_Point";
|
||||
shortName = "DI";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - WhiteSpace";
|
||||
+ "\r\n# Generated from Other_Default_Ignorable_Code_Point + Cf + Cc + Cs - White_Space";
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
if (ucdData.getBinaryProperty(cp, White_space)) return false;
|
||||
|
@ -471,11 +538,12 @@ of characters, the first of which has a non-zero combining class.
|
|||
*/
|
||||
dprops[GraphemeExtend] = new DProp() {
|
||||
{
|
||||
name = "GraphemeExtend";
|
||||
name = "Grapheme_Extend";
|
||||
shortName = "GrExt";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Me + Mn + Mc + Other_GraphemeExtend - GraphemeLink"
|
||||
+ "\r\n# Generated from: Me + Mn + Mc + Other_Grapheme_Extend - Grapheme_Link"
|
||||
+ "\r\n# Used in the definition of GraphemeCluster: "
|
||||
+ "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*";
|
||||
+ "\r\n# GraphemeCluster ::= GraphameBase? ( Grapheme_Extend | Grapheme_Link Join_Control? Grapheme_Base? )*";
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
if (ucdData.getBinaryProperty(cp, GraphemeExtend)) return false;
|
||||
|
@ -486,13 +554,80 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[Other_Case_Ignorable] = new DProp() {
|
||||
{
|
||||
name = "Other_Case_Ignorable";
|
||||
shortName = "OCI";
|
||||
|
||||
header = header = "# Binary Property";
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
switch(cp) {
|
||||
case 0x27: case 0x2019: case 0xAD: return true;
|
||||
// case 0x2d: case 0x2010: case 0x2011:
|
||||
/*
|
||||
0027 ; Other_Case_Ignorable # Po APOSTROPHE
|
||||
00AD ; Other_Case_Ignorable # Pd SOFT HYPHEN
|
||||
2019 ; Other_Case_Ignorable # Pf RIGHT SINGLE QUOTATION MARK
|
||||
*/
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
dprops[Type_i] = new DProp() {
|
||||
{
|
||||
name = "Special_Dotted";
|
||||
shortName = "SDot";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: all characters whose canonical decompositions end with a combining character sequence that"
|
||||
+ "\r\n# - starts with i or j"
|
||||
+ "\r\n# - has no combining marks above"
|
||||
+ "\r\n# - has no combining marks with zero canonical combining class"
|
||||
;
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
if (cp == 'i' || cp == 'j') return true;
|
||||
if (!nfkd.hasDecomposition(cp)) return false;
|
||||
String decomp = nfd.normalize(cp);
|
||||
boolean ok = false;
|
||||
for (int i = decomp.length()-1; i >= 0; --i) {
|
||||
char ch = decomp.charAt(i);
|
||||
int cc = ucdData.getCombiningClass(ch);
|
||||
if (cc == 230) return false;
|
||||
if (cc == 0) {
|
||||
if (ch == 'i' || ch == 'j') ok = true;
|
||||
else return false;
|
||||
}
|
||||
}
|
||||
return ok;
|
||||
}
|
||||
};
|
||||
|
||||
dprops[Case_Ignorable] = new DProp() {
|
||||
{
|
||||
name = "Case_Ignorable";
|
||||
shortName = "CI";
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: Other_Case_Ignorable + Lm + Mn + Me + Cf";
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
if (cat == Lm || cat == Cf || cat == Mn || cat == Me) return true;
|
||||
if (dprops[Other_Case_Ignorable].hasProperty(cp)) return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
dprops[GraphemeBase] = new DProp() {
|
||||
{
|
||||
name = "GraphemeBase";
|
||||
name = "Grapheme_Base";
|
||||
shortName = "GrBase";
|
||||
|
||||
header = header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - GraphemeLink - GraphemeExtend"
|
||||
+ "\r\n# Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Link - Grapheme_Extend"
|
||||
+ "\r\n# Used in the definition of GraphemeCluster: "
|
||||
+ "\r\n# GraphemeCluster ::= GraphameBase? ( GraphemeExtend | GraphemeLink Join_Control? GraphemeBase? )*";
|
||||
+ "\r\n# GraphemeCluster ::= GraphameBase? ( Grapheme_Extend | Grapheme_Link Join_Control? Grapheme_Base? )*";
|
||||
}
|
||||
boolean hasProperty(int cp) {
|
||||
byte cat = ucdData.getCategory(cp);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DiffPropertyLister.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -25,7 +25,7 @@ class DiffPropertyLister extends PropertyLister {
|
|||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return ucdData.getVersion();
|
||||
return major_minor_only(ucdData.getVersion());
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -49,9 +49,10 @@ class DiffPropertyLister extends PropertyLister {
|
|||
|
||||
public String headerString() {
|
||||
if (oldUCD != null) {
|
||||
return "# Differences between " + ucdData.getVersion() + " and " + oldUCD.getVersion();
|
||||
return "# Differences between " + major_minor_only(ucdData.getVersion())
|
||||
+ " and " + major_minor_only(oldUCD.getVersion());
|
||||
} else {
|
||||
return "# Allocated as of " + ucdData.getVersion();
|
||||
return "# Designated as of " + major_minor_only(ucdData.getVersion());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -80,6 +81,10 @@ class DiffPropertyLister extends PropertyLister {
|
|||
return count;
|
||||
}
|
||||
*/
|
||||
|
||||
private String major_minor_only(String s) {
|
||||
return s.substring(0, s.lastIndexOf('.'));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateData.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -19,19 +19,25 @@ import java.text.DateFormat;
|
|||
import java.text.SimpleDateFormat;
|
||||
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UTF16;
|
||||
|
||||
|
||||
public class GenerateData implements UCD_Types {
|
||||
|
||||
static UnifiedBinaryProperty ubp;
|
||||
|
||||
public static void main (String inVersion, String[] args) throws IOException {
|
||||
System.out.println("START");
|
||||
ucd = UCD.make(inVersion);
|
||||
ubp = new UnifiedBinaryProperty(ucd);
|
||||
|
||||
System.out.println("Loaded UCD " + ucd.getVersion() + " " + (new Date(ucd.getDate())));
|
||||
String version = ucd.getVersion();
|
||||
|
||||
for (int i = 0; i < args.length; ++i) {
|
||||
String arg = args[i];
|
||||
if (arg.charAt(0) == '#') return; // skip rest of line
|
||||
int mask = 0;
|
||||
long mask = 0;
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("Argument: " + args[i]);
|
||||
|
@ -39,7 +45,16 @@ public class GenerateData implements UCD_Types {
|
|||
if (arg.equalsIgnoreCase("partition")) {
|
||||
partitionProperties();
|
||||
} else if (arg.equalsIgnoreCase("list")) {
|
||||
listProperties();
|
||||
listProperties();
|
||||
} else if (arg.equalsIgnoreCase("listAccents")) {
|
||||
listCombiningAccents();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listGreekVowels")) {
|
||||
listGreekVowels();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("listKatakana")) {
|
||||
listKatakana();
|
||||
|
||||
} else if (arg.equalsIgnoreCase("diff")) {
|
||||
listDifferences();
|
||||
} else if (arg.equalsIgnoreCase("DerivedBidiClass")) {
|
||||
|
@ -91,6 +106,18 @@ public class GenerateData implements UCD_Types {
|
|||
mask = Utility.setBits(mask, DerivedProperty.DefaultIgnorable, DerivedProperty.FC_NFC_Closure-1);
|
||||
generateDerived(mask, HEADER_DERIVED, "DerivedCoreProperties-" + version );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("caseignorable")) {
|
||||
mask = Utility.setBits(0, DerivedProperty.Other_Case_Ignorable, DerivedProperty.Type_i);
|
||||
generateDerived(mask, HEADER_DERIVED, "CaseIgnorable-" + version );
|
||||
|
||||
} else if (arg.equalsIgnoreCase("nfcprops")) {
|
||||
mask = Utility.setBits(0, NFC_Leading, NFC_Resulting);
|
||||
generateDerived(mask, HEADER_DERIVED, "NFKC_SafeStart-" + version);
|
||||
|
||||
} else if (arg.equalsIgnoreCase("nfunsafestart")) {
|
||||
mask = Utility.setBits(0, NFD_UnsafeStart, NFKC_UnsafeStart);
|
||||
generateDerived(mask, HEADER_DERIVED, "NFUnsafeStart-" + version);
|
||||
|
||||
} else if (arg.equalsIgnoreCase("DerivedAge")) {
|
||||
generateAge("DerivedAge-" + version );
|
||||
|
||||
|
@ -202,11 +229,11 @@ public class GenerateData implements UCD_Types {
|
|||
output.println();
|
||||
}
|
||||
|
||||
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
|
||||
public static void generateDerived (long bitMask, int headerChoice, String fileName) throws IOException {
|
||||
PrintWriter output = Utility.openPrintWriter(fileName + "dX.txt");
|
||||
doHeader(fileName, output, headerChoice);
|
||||
for (int i = 0; i < DerivedProperty.LIMIT; ++i) {
|
||||
if ((bitMask & (1<<i)) == 0) continue;
|
||||
if ((bitMask & (1L<<i)) == 0) continue;
|
||||
System.out.print('.');
|
||||
output.println("# ================================================");
|
||||
output.println();
|
||||
|
@ -277,7 +304,7 @@ public class GenerateData implements UCD_Types {
|
|||
for (int i = 1; i < LIMIT_ENUM; ++i) { // || iType == SCRIPT
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (!ubp.isDefined(i)) continue;
|
||||
props[count++] = i;
|
||||
}
|
||||
System.out.println("props: " + count);
|
||||
|
@ -292,7 +319,7 @@ public class GenerateData implements UCD_Types {
|
|||
if (!ucd.isAllocated(cp)) continue;
|
||||
|
||||
for (int i = 0; i < count; ++i) {
|
||||
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, props[i]);
|
||||
boolean iProp = ubp.get(cp, props[i]);
|
||||
if (iProp) probe.set(i); else probe.clear(i);
|
||||
}
|
||||
|
||||
|
@ -315,9 +342,9 @@ public class GenerateData implements UCD_Types {
|
|||
for (int i = 1; i < LIMIT_ENUM; ++i) {
|
||||
int iType = i & 0xFF00;
|
||||
if (iType == JOINING_GROUP || iType == AGE || iType == COMBINING_CLASS || iType == SCRIPT) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
String iNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
|
||||
String iNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
|
||||
if (!ubp.isDefined(i)) continue;
|
||||
String iNameShort = ubp.getFullID(i, SHORT);
|
||||
String iNameLong = ubp.getFullID(i, LONG);
|
||||
|
||||
System.out.println();
|
||||
System.out.println();
|
||||
|
@ -329,7 +356,7 @@ public class GenerateData implements UCD_Types {
|
|||
int jType = j & 0xFF00;
|
||||
if (jType == JOINING_GROUP || jType == AGE || jType == COMBINING_CLASS || jType == SCRIPT
|
||||
|| (jType == iType && jType != BINARY_PROPERTIES)) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, j)) continue;
|
||||
if (!ubp.isDefined(j)) continue;
|
||||
|
||||
if ((j >> 8) != last) {
|
||||
last = j >> 8;
|
||||
|
@ -349,8 +376,8 @@ public class GenerateData implements UCD_Types {
|
|||
if (cat == UNASSIGNED || cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (!ucd.isAllocated(cp)) continue;
|
||||
|
||||
boolean iProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, i);
|
||||
boolean jProp = MyPropertyLister.getUnifiedBinaryProperty(ucd, cp, j);
|
||||
boolean iProp = ubp.get(cp, i);
|
||||
boolean jProp = ubp.get(cp, j);
|
||||
|
||||
if (jProp) ++jCount;
|
||||
if (iProp) {
|
||||
|
@ -361,8 +388,8 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
if (iCount == 0 || jCount == 0) continue;
|
||||
|
||||
String jNameShort = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.SHORT);
|
||||
//String jNameLong = MyPropertyLister.getFullUnifiedBinaryPropertyID(ucd, j, MyPropertyLister.LONG);
|
||||
String jNameShort = ubp.getFullID(j, SHORT);
|
||||
//String jNameLong = ubp.getFullID(j, LONG);
|
||||
|
||||
String rel = bothCount == 0 ? "DISJOINT"
|
||||
: i_jPropCount == 0 && j_iPropCount == 0 ? "EQUALS"
|
||||
|
@ -384,26 +411,69 @@ public class GenerateData implements UCD_Types {
|
|||
}
|
||||
|
||||
|
||||
public static void listProperties() {
|
||||
public static void listProperties() throws IOException {
|
||||
String propAbb = "";
|
||||
String prop = "";
|
||||
|
||||
Map duplicates = new TreeMap();
|
||||
Set sorted = new TreeSet(java.text.Collator.getInstance());
|
||||
String spacing;
|
||||
|
||||
for(int k = 0; k < UCD_Names.NON_ENUMERATED.length; ++k) {
|
||||
propAbb = UCD_Names.NON_ENUMERATED[k][0];
|
||||
prop = UCD_Names.NON_ENUMERATED[k][1];
|
||||
spacing = Utility.repeat(" ", 10-propAbb.length());
|
||||
sorted.add("AA; " + propAbb + spacing + "; " + prop);
|
||||
checkDuplicate(duplicates, propAbb, prop);
|
||||
if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop);
|
||||
}
|
||||
|
||||
sorted.add("xx; T ; True");
|
||||
checkDuplicate(duplicates, "T", "xx");
|
||||
sorted.add("xx; F ; False");
|
||||
checkDuplicate(duplicates, "F", "xx");
|
||||
sorted.add("qc; Y ; Yes");
|
||||
checkDuplicate(duplicates, "Y", "qc");
|
||||
sorted.add("qc; N ; No");
|
||||
checkDuplicate(duplicates, "Y", "qc");
|
||||
sorted.add("qc; M ; Maybe");
|
||||
checkDuplicate(duplicates, "Y", "qc");
|
||||
|
||||
|
||||
for (int i = 0; i < LIMIT_ENUM; ++i) {
|
||||
int type = i & 0xFF00;
|
||||
if (type == JOINING_GROUP || type == AGE) continue;
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
String value = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.LONG);
|
||||
if (type == AGE) continue;
|
||||
if (i == (BINARY_PROPERTIES | CaseFoldTurkishI)) continue;
|
||||
|
||||
if (type == i && type != BINARY_PROPERTIES && type != DERIVED) {
|
||||
propAbb = ubp.getPropertyName(i, SHORT);
|
||||
prop = ubp.getPropertyName(i, LONG);
|
||||
spacing = Utility.repeat(" ", 10-propAbb.length());
|
||||
sorted.add("BB; " + propAbb + spacing + "; " + prop);
|
||||
checkDuplicate(duplicates, propAbb, prop);
|
||||
if (!prop.equals(propAbb)) checkDuplicate(duplicates, prop, prop);
|
||||
}
|
||||
|
||||
if (!ubp.isDefined(i)) continue;
|
||||
if (ubp.isTest(i)) continue;
|
||||
|
||||
String value = ubp.getID(i, LONG);
|
||||
if (value.length() == 0) value = "none";
|
||||
else if (value.equals("<unused>")) continue;
|
||||
String abbvalue = MyPropertyLister.getUnifiedBinaryPropertyID(ucd, i, MyPropertyLister.SHORT);
|
||||
value = fixGaps(value);
|
||||
|
||||
if (type == SCRIPT) {
|
||||
value = ucd.getCase(value, FULL, TITLE);
|
||||
}
|
||||
|
||||
String abbvalue = ubp.getID(i, SHORT);
|
||||
if (abbvalue.length() == 0) abbvalue = "no";
|
||||
|
||||
if (type == COMBINING_CLASS) {
|
||||
value = MyPropertyLister.getCombiningName(i);
|
||||
if (value.length() == 0) {
|
||||
if ((i & 0xFF) == 0) value = "99";
|
||||
else continue;
|
||||
}
|
||||
abbvalue = value;
|
||||
if (value.startsWith("Fixed_")) { continue; }
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
String elide = "";
|
||||
if (type == CATEGORY || type == SCRIPT || type == BINARY_PROPERTIES) elide = "\\p{"
|
||||
+ abbvalue
|
||||
|
@ -421,9 +491,78 @@ public class GenerateData implements UCD_Types {
|
|||
+ value
|
||||
+ "}";
|
||||
System.out.println("<tr><td>" + elide + "</td><td>" + abb + "</td><td>" + norm + "</td></tr>");
|
||||
*/
|
||||
|
||||
spacing = Utility.repeat(" ", 10-abbvalue.length());
|
||||
|
||||
if (type == BINARY_PROPERTIES || type == DERIVED) {
|
||||
sorted.add("ZZ; " + abbvalue + spacing + "; " + value);
|
||||
checkDuplicate(duplicates, value, value);
|
||||
if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, value);
|
||||
continue;
|
||||
}
|
||||
|
||||
sorted.add(propAbb + "; " + abbvalue + spacing + "; " + value);
|
||||
checkDuplicate(duplicates, value, prop + "=" + value);
|
||||
if (!value.equals(abbvalue)) checkDuplicate(duplicates, abbvalue, prop + "=" + value);
|
||||
}
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter("PropertyAliases.txt");
|
||||
Utility.appendFile("PropertyAliasHeader.txt", log);
|
||||
Utility.print(log, sorted, "\r\n", new MyBreaker());
|
||||
log.close();
|
||||
}
|
||||
|
||||
static class MyBreaker implements Utility.Breaker {
|
||||
public String get(Object current, Object old) {
|
||||
if (old == null) return "";
|
||||
String c = current.toString();
|
||||
String o = old.toString();
|
||||
if (c.length() >= 2 && o.length() >= 0 && !c.substring(0,2).equals(o.substring(0,2))) {
|
||||
return "\r\n";
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void checkDuplicate(Map m, String toCheck, String comment) {
|
||||
String result = (String) m.get(toCheck);
|
||||
if (result != null) {
|
||||
System.out.println("Collision with " + toCheck);
|
||||
System.out.println(" Between " + comment);
|
||||
System.out.println(" And " + result);
|
||||
} else {
|
||||
m.put(skeleton(toCheck), comment);
|
||||
}
|
||||
}
|
||||
|
||||
static String fixGaps(String source) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
byte lastCat = -1;
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char c = source.charAt(i);
|
||||
if (c == ' ' || c == '-') c = '_';
|
||||
byte cat = ucd.getCategory(c);
|
||||
if (lastCat == Ll && cat == Lu) {
|
||||
result.append('_');
|
||||
}
|
||||
result.append(c);
|
||||
lastCat = cat;
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
static String skeleton(String source) {
|
||||
StringBuffer result = new StringBuffer();
|
||||
source = source.toLowerCase();
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
char c = source.charAt(i);
|
||||
if (c < 'a' || c > 'z') continue;
|
||||
result.append(c);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
static final byte KEEP_SPECIAL = 0, SKIP_SPECIAL = 1;
|
||||
|
||||
public static void generateVerticalSlice(int startEnum, int endEnum, byte skipSpecial,
|
||||
|
@ -445,7 +584,7 @@ public class GenerateData implements UCD_Types {
|
|||
doHeader(file, output, headerChoice);
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (!ubp.isDefined(i)) continue;
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|| i == (BINARY_PROPERTIES | CaseFoldTurkishI)
|
||||
|
@ -689,14 +828,19 @@ public class GenerateData implements UCD_Types {
|
|||
static final void generateAge(String filename) throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter(filename + "dX.txt");
|
||||
try {
|
||||
log.println("# Derived file showing when various code points were allocated in Unicode");
|
||||
log.println("# Derived file showing when various code points were designated in Unicode");
|
||||
log.println("# author: M. Davis");
|
||||
log.println("# generated: " + new Date());
|
||||
log.println("# Notes:");
|
||||
log.println("# - The old Hangul Syllables (removed from 2.0) are not included in the 1.1.0 listing.");
|
||||
log.println("# - The term 'designated' means that a previously reserved code point was specified");
|
||||
log.println("# to be a noncharacter or surrogate, or assigned as a character,");
|
||||
log.println("# control or format code.");
|
||||
log.println("# - Versions are only tracked from 1.1 onwards, since version 1.0");
|
||||
log.println("# predated changes required by the ISO 10646 merger.");
|
||||
log.println("# - The Hangul Syllables that were removed from 2.0 are not included in the 1.1 listing.");
|
||||
log.println("# - The supplementary private use code points and the non-character code points");
|
||||
log.println("# were allocated in version 2.0, but not specifically listed in the UCD");
|
||||
log.println("# until versions 3.0.1 and 3.1.0 respectively.");
|
||||
log.println("# were designated in version 2.0, but not specifically listed in the UCD");
|
||||
log.println("# until versions 3.0 and 3.1 respectively.");
|
||||
|
||||
log.println("# ================================================");
|
||||
log.println();
|
||||
|
@ -713,6 +857,9 @@ public class GenerateData implements UCD_Types {
|
|||
log.println("# ================================================");
|
||||
log.println();
|
||||
new DiffPropertyLister("3.0.0", "3.1.0", log).print();
|
||||
log.println("# ================================================");
|
||||
log.println();
|
||||
new DiffPropertyLister("3.1.0", "3.2.0", log).print();
|
||||
/*
|
||||
printDiff("110", "200");
|
||||
UnicodeSet u11 = fromFile(BASE_DIR + "UnicodeData\\Versions\\UnicodeData-1.1.txt", false);
|
||||
|
@ -761,5 +908,133 @@ public class GenerateData implements UCD_Types {
|
|||
|
||||
}
|
||||
|
||||
public static void listCombiningAccents() throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter("ListAccents.txt");
|
||||
Normalizer nfd = new Normalizer(Normalizer.NFD);
|
||||
Set set = new TreeSet();
|
||||
Set set2 = new TreeSet();
|
||||
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
|
||||
if (!nfd.hasDecomposition(i)) {
|
||||
if (ucd.getScript(i) == LATIN_SCRIPT) {
|
||||
int cp = i;
|
||||
String hex = "u" + Utility.hex(cp, 4);
|
||||
set.add("# yyy $x <> \\" + hex + " ; # " + ucd.getName(cp));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
String decomp = nfd.normalize(i);
|
||||
int j;
|
||||
for (j = 0; j < decomp.length(); j += UTF16.getCharCount(i)) {
|
||||
int cp = UTF16.charAt(decomp, j);
|
||||
byte cat = ucd.getCategory(cp);
|
||||
if (cat != Mn) continue;
|
||||
String hex = "u" + Utility.hex(cp, 4);
|
||||
set.add("# xxx $x <> \\" + hex + " ; # " + ucd.getName(cp));
|
||||
}
|
||||
}
|
||||
|
||||
Iterator it = set.iterator();
|
||||
while (it.hasNext()) {
|
||||
log.println(it.next());
|
||||
}
|
||||
log.close();
|
||||
}
|
||||
|
||||
public static void listGreekVowels() throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter("ListGreekVowels.txt");
|
||||
Normalizer nfd = new Normalizer(Normalizer.NFD);
|
||||
Normalizer nfc = new Normalizer(Normalizer.NFC);
|
||||
Set set = new TreeSet();
|
||||
Set set2 = new TreeSet();
|
||||
|
||||
String vowels = "\u03B1\u03B5\u03B7\u03B9\u03BF\u03C5\u03C9\u0391\u0395\u0397\u0399\u039F\u03A5\u03A9";
|
||||
String diphthongEnd = "\u03B9\u03C5\u0399\u03A5";
|
||||
String diphthongStart = "\u03B1\u03B5\u03B7\u03BF\u03C5\u0391\u0395\u0397\u039F\u03A5";
|
||||
String etas = "\u03B7\u0397";
|
||||
String iotas = "\u03B9\u0399";
|
||||
|
||||
for (char i = 0; i < 0xFFFF; ++i) {
|
||||
Utility.dot(i);
|
||||
if (!ucd.isRepresented(i)) continue;
|
||||
if (ucd.getScript(i) != GREEK_SCRIPT) continue;
|
||||
String decomp = nfd.normalize(i);
|
||||
|
||||
if (decomp.indexOf('\u0306') >= 0) continue; // skip breve
|
||||
if (decomp.indexOf('\u0304') >= 0) continue; // skip macron
|
||||
|
||||
String comp = nfc.normalize(decomp);
|
||||
if (!comp.equals(String.valueOf(i))) continue; // skip compats
|
||||
|
||||
char first = decomp.charAt(0);
|
||||
|
||||
if (vowels.indexOf(first) < 0) continue;
|
||||
|
||||
String h = "";
|
||||
if (decomp.indexOf('\u0314') >= 0) h = "\uFFFF";
|
||||
|
||||
if (diphthongEnd.indexOf(first) >= 0) {
|
||||
for (int j = 0; j < diphthongStart.length(); ++j) {
|
||||
String v = diphthongStart.substring(j, j+1);
|
||||
char vc = v.charAt(0);
|
||||
if (ucd.getCategory(vc) == Ll && ucd.getCategory(first) == Lu) continue;
|
||||
if (etas.indexOf(vc) >= 0 && iotas.indexOf(first) >= 0) continue;
|
||||
set.add(new Pair(h + v + first, new Pair(v + decomp, v + i)));
|
||||
}
|
||||
}
|
||||
set.add(new Pair(h+first, new Pair(decomp, String.valueOf(i))));
|
||||
}
|
||||
|
||||
Iterator it = set.iterator();
|
||||
Object last = "";
|
||||
while (it.hasNext()) {
|
||||
Pair p = (Pair) it.next();
|
||||
if (!last.equals(p.first)) {
|
||||
log.println();
|
||||
last = p.first;
|
||||
} else {
|
||||
log.print(", ");
|
||||
}
|
||||
p = (Pair) p.second;
|
||||
log.print(p.second);
|
||||
}
|
||||
log.close();
|
||||
}
|
||||
|
||||
public static void listKatakana() throws IOException {
|
||||
|
||||
for (char i = 'a'; i <= 'z'; ++i) {
|
||||
doKana(String.valueOf(i));
|
||||
if (i == 'c') doKana("ch");
|
||||
if (i == 's') doKana("sh");
|
||||
if (i == 'd') {
|
||||
doKana("dz");
|
||||
doKana("dj");
|
||||
}
|
||||
}
|
||||
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
public static void doKana(String i) {
|
||||
|
||||
String vowels = "aeiou";
|
||||
System.out.println();
|
||||
System.out.print(i + " " + i + i);
|
||||
System.out.println();
|
||||
for (int j = 0; j < vowels.length(); ++j) {
|
||||
char c = vowels.charAt(j);
|
||||
System.out.print(" " + i + c);
|
||||
}
|
||||
|
||||
System.out.println();
|
||||
for (int j = 0; j < vowels.length(); ++j) {
|
||||
char c = vowels.charAt(j);
|
||||
System.out.print(" " + i + "y" + c);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -37,6 +37,7 @@ public final class Main {
|
|||
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
|
||||
|
||||
else if (arg.equalsIgnoreCase("testDerivedProperties")) DerivedProperty.test();
|
||||
else if (arg.equalsIgnoreCase("checkCase")) VerifyUCD.checkCase();
|
||||
|
@ -49,8 +50,9 @@ public final class Main {
|
|||
//else if (arg.equalsIgnoreCase("checkAgainstUInfo")) checkAgainstUInfo();
|
||||
else if (arg.equalsIgnoreCase("checkScripts")) VerifyUCD.checkScripts();
|
||||
else if (arg.equalsIgnoreCase("IdentifierTest")) VerifyUCD.IdentifierTest();
|
||||
else if (arg.equalsIgnoreCase("GenerateData")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
|
||||
else if (arg.equalsIgnoreCase("Generate")) GenerateData.main(ucdVersion, Utility.split(args[++i],','));
|
||||
else if (arg.equalsIgnoreCase("BuildNames")) BuildNames.main(null);
|
||||
else if (arg.equalsIgnoreCase("JavascriptProperties")) WriteJavaScriptInfo.assigned();
|
||||
else if (arg.equalsIgnoreCase("writeNormalizerTestSuite"))
|
||||
GenerateData.writeNormalizerTestSuite("NormalizationTest-3.1.1d1.txt");
|
||||
else {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/MyPropertyLister.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,64 +21,36 @@ final class MyPropertyLister extends PropertyLister {
|
|||
static final boolean BRIDGE = false;
|
||||
|
||||
private int propMask;
|
||||
|
||||
UnifiedBinaryProperty ubp;
|
||||
|
||||
/**
 * Creates a lister for one unified property value.
 *
 * @param ucd the character database to read from
 * @param propMask the unified property number to list
 * @param output the writer that receives the listing
 */
public MyPropertyLister(UCD ucd, int propMask, PrintWriter output) {
    this.ucdData = ucd;
    this.output = output;
    this.propMask = propMask;
    this.ubp = new UnifiedBinaryProperty(ucd);
    // skip gen cat
    if (propMask < COMBINING_CLASS) {
        usePropertyComment = false;
    }
}
|
||||
|
||||
static String getCombiningName (int propMask) {
|
||||
String s = "";
|
||||
switch (propMask & 0xFF) {
|
||||
case 0: s = "NotReordered"; break;
|
||||
case 1: s = "Overlay"; break;
|
||||
case 7: s = "Nukta"; break;
|
||||
case 8: s = "KanaVoicing"; break;
|
||||
case 9: s = "Virama"; break;
|
||||
case 202: s = "AttachedBelowLeft"; break;
|
||||
case 204: s = "AttachedBelow"; break;
|
||||
case 206: s = "AttachedBelowRight"; break;
|
||||
case 208: s = "AttachedLeft"; break;
|
||||
case 210: s = "AttachedRight"; break;
|
||||
case 212: s = "AttachedAboveLeft"; break;
|
||||
case 214: s = "AttachedAbove"; break;
|
||||
case 216: s = "AttachedAboveRight"; break;
|
||||
case 218: s = "BelowLeft"; break;
|
||||
case 220: s = "Below"; break;
|
||||
case 222: s = "BelowRight"; break;
|
||||
case 224: s = "Left"; break;
|
||||
case 226: s = "Right"; break;
|
||||
case 228: s = "AboveLeft"; break;
|
||||
case 230: s = "Above"; break;
|
||||
case 232: s = "AboveRight"; break;
|
||||
case 233: s = "DoubleBelow"; break;
|
||||
case 234: s = "DoubleAbove"; break;
|
||||
case 240: s = "IotaSubscript"; break;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
public String headerString() {
|
||||
int main = (propMask & 0xFF00);
|
||||
if (main == COMBINING_CLASS) {
|
||||
String s = getCombiningName(propMask);
|
||||
if (s.length() == 0) s = "Other Combining Class";
|
||||
String s = UCD.getCombiningID_fromIndex((short)(propMask & 0xFF), LONG);
|
||||
if (s.startsWith("Fixed")) s = "Other Combining Class";
|
||||
return "# " + s;
|
||||
} else if (main == BINARY_PROPERTIES) {
|
||||
return "";
|
||||
} else if (main == JOINING_GROUP) {
|
||||
return "";
|
||||
} else {
|
||||
String shortID = getUnifiedBinaryPropertyID(ucdData, propMask, SHORT);
|
||||
String longID = getUnifiedBinaryPropertyID(ucdData, propMask, LONG);
|
||||
String shortID = ubp.getID(propMask, SHORT);
|
||||
String longID = ubp.getID(propMask, LONG);
|
||||
return "# " + shortID + (shortID.equals(longID) ? "" : "\t(" + longID + ")");
|
||||
}
|
||||
}
|
||||
|
||||
public String propertyName(int cp) {
|
||||
return getUnifiedBinaryPropertyID(propMask);
|
||||
return ubp.getID(propMask);
|
||||
}
|
||||
|
||||
public String optionalComment(int cp) {
|
||||
|
@ -115,7 +87,7 @@ final class MyPropertyLister extends PropertyLister {
|
|||
else return EXCLUDE;
|
||||
}
|
||||
|
||||
boolean inSet = getUnifiedBinaryProperty(cp, propMask);
|
||||
boolean inSet = ubp.get(cp, propMask);
|
||||
/*
|
||||
if (cp >= 0x1D400 && cp <= 0x1D7C9 && cat != Cn) {
|
||||
if (propMask == (SCRIPT | LATIN_SCRIPT)) inSet = cp <= 0x1D6A3;
|
||||
|
@ -133,151 +105,6 @@ final class MyPropertyLister extends PropertyLister {
|
|||
return INCLUDE;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return unified property number
|
||||
*/
|
||||
public static boolean isUnifiedBinaryPropertyDefined(UCD ucd, int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: return propMask != UNUSED_CATEGORY && propMask < LIMIT_CATEGORY;
|
||||
case COMBINING_CLASS>>8: return ucd.isCombiningClassUsed((byte)propMask);
|
||||
case BIDI_CLASS>>8: return propMask != BIDI_UNUSED && propMask < LIMIT_BIDI_CLASS;
|
||||
case DECOMPOSITION_TYPE>>8: return propMask < LIMIT_DECOMPOSITION_TYPE;
|
||||
case NUMERIC_TYPE>>8: return propMask < LIMIT_NUMERIC_TYPE;
|
||||
case EAST_ASIAN_WIDTH>>8: return propMask < LIMIT_EAST_ASIAN_WIDTH;
|
||||
case LINE_BREAK>>8: return propMask < LIMIT_LINE_BREAK;
|
||||
case JOINING_TYPE>>8: return propMask < LIMIT_JOINING_TYPE;
|
||||
case JOINING_GROUP>>8: return propMask < LIMIT_JOINING_GROUP;
|
||||
case BINARY_PROPERTIES>>8: return propMask < LIMIT_BINARY_PROPERTIES;
|
||||
case SCRIPT>>8: return propMask != UNUSED_SCRIPT && propMask < LIMIT_SCRIPT;
|
||||
case AGE>>8: return propMask < LIMIT_AGE;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean getUnifiedBinaryProperty(int cp, int propMask) {
|
||||
return getUnifiedBinaryProperty(ucdData, cp, propMask);
|
||||
}
|
||||
|
||||
static public boolean getUnifiedBinaryProperty(UCD ucd, int cp, int propMask) {
|
||||
int enum = propMask >> 8;
|
||||
propMask &= 0xFF;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
|
||||
return ucd.getCategory(cp) == propMask;
|
||||
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
|
||||
return ucd.getCombiningClass(cp) == propMask;
|
||||
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
|
||||
return ucd.getBidiClass(cp) == propMask;
|
||||
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
|
||||
return ucd.getDecompositionType(cp) == propMask;
|
||||
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
|
||||
return ucd.getNumericType(cp) == propMask;
|
||||
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
return ucd.getEastAsianWidth(cp) == propMask;
|
||||
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
|
||||
return ucd.getLineBreak(cp) == propMask;
|
||||
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
|
||||
return ucd.getJoiningType(cp) == propMask;
|
||||
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroup(cp) == propMask;
|
||||
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
|
||||
return ucd.getBinaryProperty(cp, propMask);
|
||||
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
|
||||
return ucd.getScript(cp) == propMask;
|
||||
case AGE>>8: if (propMask >= LIMIT_AGE) break;
|
||||
return ucd.getAge(cp) == propMask;
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
static final int SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
|
||||
|
||||
public String getUnifiedBinaryPropertyID(int unifiedPropMask) {
|
||||
return getUnifiedBinaryPropertyID(ucdData, unifiedPropMask, NORMAL);
|
||||
}
|
||||
|
||||
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask) {
|
||||
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
|
||||
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
|
||||
if (longOne.equals(shortOne)) return longOne;
|
||||
return shortOne + "(" + longOne + ")";
|
||||
}
|
||||
|
||||
public static String getFullUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
|
||||
String pre = "";
|
||||
if ((unifiedPropMask & 0xFF00) != BINARY_PROPERTIES) {
|
||||
String preShort = UCD_Names.ABB_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
|
||||
String preLong = UCD_Names.SHORT_UNIFIED_PROPERTIES[unifiedPropMask>>8] + "=";
|
||||
if (style < LONG) pre = preShort;
|
||||
else if (style == LONG || preShort.equals(preLong)) pre = preLong;
|
||||
else pre = preShort + "(" + preLong + ")";
|
||||
}
|
||||
String shortOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, SHORT);
|
||||
if (shortOne.length() == 0) shortOne = "xx";
|
||||
String longOne = getUnifiedBinaryPropertyID(ucd, unifiedPropMask, LONG);
|
||||
if (longOne.length() == 0) longOne = "none";
|
||||
|
||||
String post;
|
||||
if (style < LONG) post = shortOne;
|
||||
else if (style == LONG || shortOne.equals(longOne)) post = longOne;
|
||||
else post = shortOne + "(" + longOne + ")";
|
||||
|
||||
if (pre.length() == 0) {
|
||||
pre = post + "=";
|
||||
post = "T";
|
||||
}
|
||||
|
||||
return pre + post;
|
||||
}
|
||||
|
||||
public static String getUnifiedBinaryPropertyID(UCD ucd, int unifiedPropMask, int style) {
|
||||
int enum = unifiedPropMask >> 8;
|
||||
byte propMask = (byte)unifiedPropMask;
|
||||
switch (enum) {
|
||||
case CATEGORY>>8: if (propMask >= LIMIT_CATEGORY) break;
|
||||
if (style != LONG) return ucd.getCategoryID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_GC[propMask];
|
||||
case COMBINING_CLASS>>8: if (propMask >= LIMIT_COMBINING_CLASS) break;
|
||||
String s = "";
|
||||
if (style == LONG) {
|
||||
s = getCombiningName(unifiedPropMask);
|
||||
if (s.length() != 0) return s;
|
||||
s = "fixed_";
|
||||
}
|
||||
return s + ucd.getCombiningClassID_fromIndex((short)(0xFF & propMask));
|
||||
case BIDI_CLASS>>8: if (propMask >= LIMIT_BIDI_CLASS) break;
|
||||
if (style != LONG) return ucd.getBidiClassID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_BC[propMask];
|
||||
case DECOMPOSITION_TYPE>>8: if (propMask >= LIMIT_DECOMPOSITION_TYPE) break;
|
||||
if (style != SHORT) return ucd.getDecompositionTypeID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_DT[propMask];
|
||||
case NUMERIC_TYPE>>8: if (propMask >= LIMIT_NUMERIC_TYPE) break;
|
||||
if (style != SHORT) return ucd.getNumericTypeID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_NT[propMask];
|
||||
case EAST_ASIAN_WIDTH>>8: if (propMask >= LIMIT_EAST_ASIAN_WIDTH) break;
|
||||
if (style != LONG) return ucd.getEastAsianWidthID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_EA[propMask];
|
||||
case LINE_BREAK>>8: if (propMask >= LIMIT_LINE_BREAK) break;
|
||||
if (style != LONG) return ucd.getLineBreakID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_LB[propMask];
|
||||
case JOINING_TYPE>>8: if (propMask >= LIMIT_JOINING_TYPE) break;
|
||||
if (style != LONG) return ucd.getJoiningTypeID_fromIndex(propMask);
|
||||
return UCD_Names.LONG_JOINING_TYPE[propMask];
|
||||
case JOINING_GROUP>>8: if (propMask >= LIMIT_JOINING_GROUP) break;
|
||||
return ucd.getJoiningGroupID_fromIndex(propMask);
|
||||
case BINARY_PROPERTIES>>8: if (propMask >= LIMIT_BINARY_PROPERTIES) break;
|
||||
if (style != SHORT) return ucd.getBinaryPropertiesID_fromIndex(propMask);
|
||||
return UCD_Names.SHORT_BP[propMask];
|
||||
case SCRIPT>>8: if (propMask >= LIMIT_SCRIPT) break;
|
||||
if (style != SHORT) return ucd.getScriptID_fromIndex(propMask);
|
||||
return UCD_Names.ABB_SCRIPT[propMask];
|
||||
case AGE>>8: if (propMask >= LIMIT_AGE) break;
|
||||
return ucd.getAgeID_fromIndex(propMask);
|
||||
}
|
||||
throw new ChainException("Illegal property Number {0}", new Object[]{new Integer(propMask)});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2001/09/06 01:29:48 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -208,12 +208,27 @@ public final class Normalizer implements UCD_Types {
|
|||
* pair is firstChar << 16 | secondChar.
|
||||
* Will need to be fixed for surrogates.
|
||||
*/
|
||||
/*
|
||||
public IntHashtable.IntEnumeration getComposition() {
|
||||
return data.getComposition();
|
||||
}
|
||||
|
||||
*/
|
||||
public void getCompositionStatus(BitSet leading, BitSet trailing, BitSet resulting) {
|
||||
Iterator it = data.compTable.keySet().iterator();
|
||||
while (it.hasNext()) {
|
||||
Long key = (Long)it.next();
|
||||
Integer result = (Integer)data.compTable.get(key);
|
||||
long keyLong = key.longValue();
|
||||
if (leading != null) leading.set((int)(keyLong >>> 32));
|
||||
if (trailing != null) trailing.set((int)keyLong);
|
||||
if (resulting != null) resulting.set(result.intValue());
|
||||
}
|
||||
for (int i = UCD.LBase; i < UCD.TLimit; ++i) {
|
||||
if (leading != null && UCD.isLeadingJamo(i)) leading.set(i); // set all initial Jamo (that form syllables)
|
||||
if (trailing != null && UCD.isTrailingJamo(i)) trailing.set(i); // set all final Jamo (that form syllables)
|
||||
}
|
||||
if (leading != null) {
|
||||
for (int i = UCD.SBase; i < UCD.SLimit; ++i) {
|
||||
if (UCD.isDoubleHangul(i)) leading.set(i); // set all two-Jamo syllables
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isTrailing(int cp) {
|
||||
return this.composition ? data.isTrailing(cp) : false;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -30,7 +30,7 @@ public class TestData implements UCD_Types {
|
|||
checkHoffman("\u05B8\u05B9\u05B1\u0591\u05C3\u05B0\u05AC\u059F");
|
||||
checkHoffman("\u0592\u05B7\u05BC\u05A5\u05B0\u05C0\u05C4\u05AD");
|
||||
|
||||
int mask = 0;
|
||||
long mask = 0;
|
||||
|
||||
if (false) {
|
||||
|
||||
|
@ -166,7 +166,7 @@ public class TestData implements UCD_Types {
|
|||
output.println();
|
||||
}
|
||||
|
||||
public static void generateDerived (int bitMask, int headerChoice, String fileName) throws IOException {
|
||||
public static void generateDerived (long bitMask, int headerChoice, String fileName) throws IOException {
|
||||
ucd = UCD.make("3.1.0");
|
||||
PrintWriter output = Utility.openPrintWriter(fileName);
|
||||
doHeader(fileName, output, headerChoice);
|
||||
|
@ -251,9 +251,11 @@ public class TestData implements UCD_Types {
|
|||
|
||||
PrintWriter output = Utility.openPrintWriter(file);
|
||||
doHeader(file, output, headerChoice);
|
||||
UnifiedBinaryProperty ubp = new UnifiedBinaryProperty(ucd);
|
||||
|
||||
int last = -1;
|
||||
for (int i = startEnum; i < endEnum; ++i) {
|
||||
if (!MyPropertyLister.isUnifiedBinaryPropertyDefined(ucd, i)) continue;
|
||||
if (!ubp.isDefined(i)) continue;
|
||||
if (i == DECOMPOSITION_TYPE || i == NUMERIC_TYPE
|
||||
|| i == (CATEGORY | UNUSED_CATEGORY)
|
||||
|| i == (BINARY_PROPERTIES | Non_break)
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -296,24 +296,38 @@ public final class UCD implements UCD_Types {
|
|||
}
|
||||
throw new IllegalArgumentException("getCase: " + caseType + ", " + simpleVsFull);
|
||||
}
|
||||
|
||||
|
||||
static final char SHY = '\u00AD';
|
||||
|
||||
static final char APOSTROPHE = '\u2019';
|
||||
|
||||
public String getCase(String s, byte simpleVsFull, byte caseType, String condition) {
|
||||
if (UTF32.length32(s) == 1) return getCase(UTF32.char32At(s, 0), simpleVsFull, caseType);
|
||||
StringBuffer result = new StringBuffer();
|
||||
int cp;
|
||||
byte currentCaseType = caseType;
|
||||
DerivedProperty dp = new DerivedProperty(this);
|
||||
|
||||
for (int i = 0; i < s.length(); i += UTF32.count16(cp)) {
|
||||
cp = UTF32.char32At(s, i);
|
||||
String mappedVersion = getCase(cp, simpleVsFull, currentCaseType, condition);
|
||||
result.append(mappedVersion);
|
||||
if (caseType == TITLE) {
|
||||
// if letter is cased, change to lowercase, otherwise change to TITLE
|
||||
if (caseType == TITLE) { // set the case type for the next character
|
||||
|
||||
// certain characters are ignored
|
||||
if (cp == '-' || cp == SHY || cp == '\'' || cp == APOSTROPHE) continue;
|
||||
byte cat = getCategory(cp);
|
||||
if (cat == Mn || cat == Me || cat == Mc) {
|
||||
// ignore!
|
||||
} else if (cat == Lu || cat == Ll || cat == Lt
|
||||
|| getBinaryProperty(cp, Other_Lowercase)
|
||||
|| getBinaryProperty(cp, Other_Uppercase)) {
|
||||
if (cat == Mn || cat == Me || cat == Cf || cat == Lm) continue;
|
||||
if (dp.hasProperty(cp, DerivedProperty.DefaultIgnorable)) continue;
|
||||
// if DefaultIgnorable is not supported, then
|
||||
// check for (Cf + Cc + Cs) - White_Space
|
||||
// if (cat == Cs && cp != 0x85 && (cp < 9 || cp > 0xD)) continue;
|
||||
|
||||
// if letter is cased, change next to lowercase, otherwise revert to TITLE
|
||||
if (cat == Lu || cat == Ll || cat == Lt
|
||||
|| getBinaryProperty(cp, Other_Lowercase) // skip if not supported
|
||||
|| getBinaryProperty(cp, Other_Uppercase) // skip if not supported
|
||||
) {
|
||||
currentCaseType = LOWER;
|
||||
} else {
|
||||
currentCaseType = TITLE;
|
||||
|
@ -528,6 +542,43 @@ public final class UCD implements UCD_Types {
|
|||
public static String getCategoryID_fromIndex(byte prop) {
|
||||
return UCD_Names.GC[prop];
|
||||
}
|
||||
|
||||
public String getCombiningID(int codePoint, byte style) {
|
||||
return getCombiningID_fromIndex(getCombiningClass(codePoint), style);
|
||||
}
|
||||
|
||||
static String getCombiningID_fromIndex (short index, byte style) {
|
||||
String s = "Fixed";
|
||||
switch (index) {
|
||||
case 0: s = style < LONG ? "NR" : "NotReordered"; break;
|
||||
case 1: s = style < LONG ? "OV" : "Overlay"; break;
|
||||
case 7: s = style < LONG ? "NK" : "Nukta"; break;
|
||||
case 8: s = style < LONG ? "KV" : "KanaVoicing"; break;
|
||||
case 9: s = style < LONG ? "VR" : "Virama"; break;
|
||||
case 202: s = style < LONG ? "ATBL" : "AttachedBelowLeft"; break;
|
||||
case 204: s = style < LONG ? "ATB" : "AttachedBelow"; break;
|
||||
case 206: s = style < LONG ? "ATBR" : "AttachedBelowRight"; break;
|
||||
case 208: s = style < LONG ? "ATL" : "AttachedLeft"; break;
|
||||
case 210: s = style < LONG ? "ATR" : "AttachedRight"; break;
|
||||
case 212: s = style < LONG ? "ATAL" : "AttachedAboveLeft"; break;
|
||||
case 214: s = style < LONG ? "ATA" : "AttachedAbove"; break;
|
||||
case 216: s = style < LONG ? "ATAR" : "AttachedAboveRight"; break;
|
||||
case 218: s = style < LONG ? "BL" : "BelowLeft"; break;
|
||||
case 220: s = style < LONG ? "B" : "Below"; break;
|
||||
case 222: s = style < LONG ? "BR" : "BelowRight"; break;
|
||||
case 224: s = style < LONG ? "L" : "Left"; break;
|
||||
case 226: s = style < LONG ? "R" : "Right"; break;
|
||||
case 228: s = style < LONG ? "AL" : "AboveLeft"; break;
|
||||
case 230: s = style < LONG ? "A" : "Above"; break;
|
||||
case 232: s = style < LONG ? "AR" : "AboveRight"; break;
|
||||
case 233: s = style < LONG ? "DB" : "DoubleBelow"; break;
|
||||
case 234: s = style < LONG ? "DB" : "DoubleAbove"; break;
|
||||
case 240: s = style < LONG ? "IS" : "IotaSubscript"; break;
|
||||
default: s += "_" + index;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
public String getBidiClassID(int codePoint) {
|
||||
return getBidiClassID_fromIndex(getBidiClass(codePoint));
|
||||
|
@ -868,7 +919,7 @@ to guarantee identifier closure.
|
|||
|
||||
// Hangul constants
|
||||
|
||||
static final int
|
||||
public static final int
|
||||
SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
|
||||
LCount = 19, VCount = 21, TCount = 28,
|
||||
NCount = VCount * TCount, // 588
|
||||
|
@ -891,6 +942,14 @@ to guarantee identifier closure.
|
|||
}
|
||||
|
||||
private static final char[] pair = new char[2];
|
||||
|
||||
static boolean isDoubleHangul(int s) {
|
||||
int SIndex = s - SBase;
|
||||
if (0 > SIndex || SIndex >= SCount) {
|
||||
throw new IllegalArgumentException("Not a Hangul Syllable: " + s);
|
||||
}
|
||||
return (SIndex % TCount) == 0;
|
||||
}
|
||||
|
||||
static String getHangulDecompositionPair(int ch) {
|
||||
int SIndex = ch - SBase;
|
||||
|
@ -923,6 +982,10 @@ to guarantee identifier closure.
|
|||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
}
|
||||
|
||||
static boolean isLeadingJamo(int cp) {
|
||||
return (LBase <= cp && cp < LLimit);
|
||||
}
|
||||
|
||||
private void fillFromFile(String version) {
|
||||
DataInputStream dataIn = null;
|
||||
String fileName = BIN_DIR + "UCD_Data" + version + ".bin";
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2001/09/01 00:06:15 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -17,6 +17,22 @@ import com.ibm.text.utility.*;
|
|||
|
||||
|
||||
final class UCD_Names implements UCD_Types {
|
||||
|
||||
public static String[][] NON_ENUMERATED = {
|
||||
{"na", "Name"},
|
||||
{"dm", "Decomposition_Mapping"},
|
||||
{"nv", "Numeric_Value"},
|
||||
{"bmg", "Bidi_Mirroring_Glyph"},
|
||||
{"lc", "Lowercase_Mapping"},
|
||||
{"uc", "Uppercase_Mapping"},
|
||||
{"tc", "Titlecase_Mapping"},
|
||||
{"cf", "Case_Folding"},
|
||||
{"slc", "Simple_Lowercase_Mapping"},
|
||||
{"suc", "Simple_Uppercase_Mapping"},
|
||||
{"stc", "Simple_Titlecase_Mapping"},
|
||||
{"sfc", "Simple_Case_Folding"},
|
||||
{"scc", "Special_Case_Condition"}
|
||||
};
|
||||
|
||||
static final String[] UNIFIED_PROPERTIES = {
|
||||
"General Category (listing UnicodeData.txt, field 2: see UnicodeData.html)",
|
||||
|
@ -32,7 +48,8 @@ final class UCD_Names implements UCD_Types {
|
|||
"Joining Group (listing ArabicShaping.txt, field 2)",
|
||||
"BidiMirrored (listing UnicodeData.txt, field 9: see UnicodeData.html)",
|
||||
"Script",
|
||||
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)"
|
||||
"Age (from a comparison of UCD versions 1.1 [minus Hangul], 2.0, 2.1, 3.0, 3.1)",
|
||||
"Derived"
|
||||
};
|
||||
|
||||
static final String[] SHORT_UNIFIED_PROPERTIES = {
|
||||
|
@ -45,9 +62,10 @@ final class UCD_Names implements UCD_Types {
|
|||
"LineBreak",
|
||||
"JoiningType",
|
||||
"JoiningGroup",
|
||||
"Value",
|
||||
"",
|
||||
"Script",
|
||||
"Age"
|
||||
"Age",
|
||||
""
|
||||
};
|
||||
|
||||
static final String[] ABB_UNIFIED_PROPERTIES = {
|
||||
|
@ -60,15 +78,16 @@ final class UCD_Names implements UCD_Types {
|
|||
"lb",
|
||||
"jt",
|
||||
"jg",
|
||||
"va",
|
||||
"",
|
||||
"sc",
|
||||
"Ag"
|
||||
"ag",
|
||||
"",
|
||||
};
|
||||
|
||||
|
||||
static final String[] BP = {
|
||||
"BidiMirrored",
|
||||
"CompositionExclusion",
|
||||
"Bidi_Mirrored",
|
||||
"Composition_Exclusion",
|
||||
"White_Space",
|
||||
"NonBreak",
|
||||
"Bidi_Control",
|
||||
|
@ -87,46 +106,46 @@ final class UCD_Names implements UCD_Types {
|
|||
"Other_Lowercase",
|
||||
"Other_Uppercase",
|
||||
"Noncharacter_Code_Point",
|
||||
"CaseFoldTurkishI",
|
||||
"Other_GraphemeExtend",
|
||||
"GraphemeLink",
|
||||
"IDS_BinaryOperator",
|
||||
"IDS_TrinaryOperator",
|
||||
"Case_Fold_Turkish_I",
|
||||
"Other_Grapheme_Extend",
|
||||
"Grapheme_Link",
|
||||
"IDS_Binary_Operator",
|
||||
"IDS_Trinary_Operator",
|
||||
"Radical",
|
||||
"UnifiedIdeograph",
|
||||
"Unified_Ideograph",
|
||||
"Other_Default_Ignorable_Code_Point",
|
||||
"Deprecated",
|
||||
};
|
||||
|
||||
static final String[] SHORT_BP = {
|
||||
"BidiM",
|
||||
"CExc",
|
||||
"WhSp",
|
||||
"CE",
|
||||
"WSpace",
|
||||
"NBrk",
|
||||
"BdCon",
|
||||
"JCon",
|
||||
"BidiC",
|
||||
"JoinC",
|
||||
"Dash",
|
||||
"Hyph",
|
||||
"Hyphen",
|
||||
"QMark",
|
||||
"TPunc",
|
||||
"Term",
|
||||
"OMath",
|
||||
"HexD",
|
||||
"AHexD",
|
||||
"OAlph",
|
||||
"Hex",
|
||||
"AHex",
|
||||
"OAlpha",
|
||||
"Ideo",
|
||||
"Diac",
|
||||
"Dia",
|
||||
"Ext",
|
||||
"OLoc",
|
||||
"OUpc",
|
||||
"OLower",
|
||||
"OUpper",
|
||||
"NChar",
|
||||
"TurkI",
|
||||
"OGrX",
|
||||
"OGrExt",
|
||||
"GrLink",
|
||||
"IDSB",
|
||||
"IDST",
|
||||
"Radical",
|
||||
"UCJK",
|
||||
"RCf",
|
||||
"UIdeo",
|
||||
"ODI",
|
||||
"Dep",
|
||||
};
|
||||
|
||||
|
@ -196,7 +215,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"Numeric", "Alphabetic", "Ideographic", "Inseperable", "Hyphen",
|
||||
"CombiningMark", "BreakBefore", "BreakAfter", "Space",
|
||||
"MandatoryBreak", "CarriageReturn", "LineFeed", "ContingentBreak",
|
||||
"ComplexContext", "Ambiguous", "BreakBeforeAndAfter", "Surrogate", "ZWSpace"
|
||||
"ComplexContext", "Ambiguous", "BreakBoth", "Surrogate", "ZWSpace"
|
||||
};
|
||||
|
||||
public static final String[] SCRIPT = {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -37,6 +37,10 @@ public interface UCD_Types {
|
|||
13 Lower case equivalent mapping. Similar to 12. This field is informative.
|
||||
14 Title case equivalent mapping. Similar to 12. This field is informative.
|
||||
*/
|
||||
|
||||
|
||||
// for IDs
|
||||
static final byte SHORT = -1, NORMAL = 0, LONG = 1, BOTH = 2;
|
||||
|
||||
// Binary ENUM Grouping
|
||||
public static final int
|
||||
|
@ -52,8 +56,9 @@ public interface UCD_Types {
|
|||
BINARY_PROPERTIES = 0x900,
|
||||
SCRIPT = 0xA00,
|
||||
AGE = 0xB00,
|
||||
DERIVED = 0xC00,
|
||||
NEXT_ENUM = 0x100,
|
||||
LIMIT_ENUM = AGE + 0x100;
|
||||
LIMIT_ENUM = DERIVED + 0x100;
|
||||
|
||||
public static final int LIMIT_COMBINING_CLASS = 256;
|
||||
|
||||
|
@ -384,4 +389,65 @@ public static byte
|
|||
YUDH_HE = 48,
|
||||
ZAIN = 49,
|
||||
LIMIT_JOINING_GROUP = 50;
|
||||
|
||||
// DERIVED PROPERTY
|
||||
|
||||
// Index constants for the derived properties. Values are consecutive ints starting at 0;
// LIMIT is one past the highest index. Entries 0..29 keep the numbering they had when this
// list was declared in DerivedProperty; 30..40 are the newly added properties.
static final int
|
||||
PropMath = 0,
|
||||
PropAlphabetic = 1,
|
||||
PropLowercase = 2,
|
||||
PropUppercase = 3,
|
||||
|
||||
ID_Start = 4,
|
||||
ID_Continue_NO_Cf = 5,
|
||||
|
||||
Mod_ID_Start = 6,
|
||||
Mod_ID_Continue_NO_Cf = 7,
|
||||
|
||||
Missing_Uppercase = 8,
|
||||
Missing_Lowercase = 9,
|
||||
Missing_Mixedcase = 10,
|
||||
|
||||
FC_NFKC_Closure = 11,
|
||||
|
||||
FullCompExclusion = 12,
|
||||
FullCompInclusion = 13,
|
||||
|
||||
QuickNFD = 14,
|
||||
QuickNFC = 15,
|
||||
QuickNFKD = 16,
|
||||
QuickNFKC = 17,
|
||||
|
||||
ExpandsOnNFD = 18,
|
||||
ExpandsOnNFC = 19,
|
||||
ExpandsOnNFKD = 20,
|
||||
ExpandsOnNFKC = 21,
|
||||
|
||||
GenNFD = 22,
|
||||
GenNFC = 23,
|
||||
GenNFKD = 24,
|
||||
GenNFKC = 25,
|
||||
|
||||
DefaultIgnorable = 26,
|
||||
GraphemeExtend = 27,
|
||||
GraphemeBase = 28,
|
||||
|
||||
FC_NFC_Closure = 29,
|
||||
|
||||
Other_Case_Ignorable = 30,
|
||||
Case_Ignorable = 31,
|
||||
Type_i = 32,
|
||||
|
||||
NFC_Leading = 33,
|
||||
NFC_TrailingNonZero = 34,
|
||||
NFC_TrailingZero = 35,
|
||||
NFC_Resulting = 36,
|
||||
|
||||
NFD_UnsafeStart = 37,
|
||||
NFC_UnsafeStart = 38,
|
||||
NFKD_UnsafeStart = 39,
|
||||
NFKC_UnsafeStart = 40,
|
||||
|
||||
// One past the last property index above; raise it whenever a constant is appended.
LIMIT = 41;
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UData.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -185,7 +185,7 @@ class UData implements UCD_Types {
|
|||
result.append(" n='").append(Utility.quoteXML(name)).append("'\r\n");
|
||||
|
||||
int lastPos = result.length();
|
||||
|
||||
|
||||
if (full || generalCategory != Lo) result.append(" gc='").append(UCD_Names.GC[generalCategory]).append('\'');
|
||||
if (full || combiningClass != 0) result.append(" cc='").append(combiningClass & 0xFF).append('\'');
|
||||
if (full || decompositionType != NONE) result.append(" dt='").append(UCD_Names.DT[decompositionType]).append('\'');
|
||||
|
@ -232,7 +232,7 @@ class UData implements UCD_Types {
|
|||
result.append("/>");
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
|
||||
public void writeBytes(DataOutputStream os) throws IOException {
|
||||
compact();
|
||||
os.writeInt(codePoint);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/VerifyUCD.java,v $
|
||||
* $Date: 2001/09/19 23:33:15 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -82,6 +82,16 @@ public class VerifyUCD implements UCD_Types {
|
|||
Utility.fixDot();
|
||||
System.out.println("checkCase");
|
||||
ucd = UCD.make(Main.ucdVersion);
|
||||
|
||||
String test = "The qui'ck br\u2019own 'fox jum\u00ADped ov\u200Ber th\u200Ce lazy dog.";
|
||||
|
||||
String ttest = ucd.getCase(test, FULL, TITLE);
|
||||
|
||||
PrintWriter titleTest = Utility.openPrintWriter("TestTitle.txt");
|
||||
titleTest.println(test);
|
||||
titleTest.println(ttest);
|
||||
titleTest.close();
|
||||
|
||||
initNormalizers();
|
||||
System.out.println(ucd.getCase("ABC,DE'F G\u0308H", FULL, TITLE));
|
||||
String fileName = "CaseDifferences.txt";
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/WriteJavaScriptInfo.java,v $
|
||||
* $Date: 2001/08/31 00:29:50 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -18,7 +18,34 @@ import java.io.*;
|
|||
//import java.text.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
public class WriteJavaScriptInfo {
|
||||
public class WriteJavaScriptInfo implements UCD_Types {
|
||||
|
||||
static public void assigned() throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter("assigned.js");
|
||||
UCD ucd = UCD.make();
|
||||
boolean wasIn = false;
|
||||
int lastWritten = -100;
|
||||
int i;
|
||||
for (i = 0; i <= 0x10FFFF; ++i) {
|
||||
byte cat = ucd.getCategory(i);
|
||||
boolean in = cat != Cn && cat != Co && cat != Cs;
|
||||
if (wasIn == in) continue;
|
||||
if (in) {
|
||||
log.print(i + ",");
|
||||
lastWritten = i;
|
||||
} else {
|
||||
if (lastWritten != i-1) log.print(i-1);
|
||||
log.println(",");
|
||||
}
|
||||
wasIn = in;
|
||||
}
|
||||
if (wasIn) {
|
||||
if (lastWritten != i-1) log.print(i-1);
|
||||
log.println(",");
|
||||
}
|
||||
log.close();
|
||||
}
|
||||
|
||||
/* TODO: fix enumeration of compositions
|
||||
|
||||
static public void writeJavascriptInfo() throws IOException {
|
||||
|
|
Loading…
Add table
Reference in a new issue