mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-0 Added Unicode property NF_SKIPPABLE
X-SVN-Rev: 7274
This commit is contained in:
parent
da4610d484
commit
70433b182b
12 changed files with 578 additions and 98 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
|
||||
* $Date: 2001/10/25 20:35:42 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -90,6 +90,7 @@ public class GenOverlap implements UCD_Types {
|
|||
addString(s, currCEList);
|
||||
}
|
||||
|
||||
/*
|
||||
for (int cp = 0x10000; cp <= 0x10FFFF; ++cp) {
|
||||
if (!ucd.isRepresented(cp)) continue;
|
||||
byte decompType = ucd.getDecompositionType(cp);
|
||||
|
@ -100,6 +101,7 @@ public class GenOverlap implements UCD_Types {
|
|||
System.out.println("Adding: " + ucd.getCodeAndName(cp) + "\t" + celist);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
Utility.fixDot();
|
||||
System.out.println("# Completes Count: " + completes.size());
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2001/10/31 00:01:28 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -74,7 +74,7 @@ final public class UCA implements Comparator {
|
|||
* Version of the UCA tables to use
|
||||
*/
|
||||
//private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7";
|
||||
public static final String VERSION = ""; // ""; // "-2.1.9d7";
|
||||
public static final String VERSION = "-3.1.1d1"; // ""; // "-2.1.9d7";
|
||||
public static final String ALLFILES = "allkeys"; // null if not there
|
||||
|
||||
/**
|
||||
|
@ -1019,6 +1019,7 @@ final public class UCA implements Comparator {
|
|||
// of the build process.
|
||||
String probe = String.valueOf(ch);
|
||||
Object value = contractingTable.get(probe);
|
||||
if (value == null) throw new IllegalArgumentException("Missing value for " + Utility.hex(ch));
|
||||
|
||||
// We loop, trying to add successive characters to the longest substring.
|
||||
while (index < decompositionBuffer.length()) {
|
||||
|
@ -1304,7 +1305,7 @@ final public class UCA implements Comparator {
|
|||
IntStack tempStack = new IntStack(100); // used for reversal
|
||||
StringBuffer multiChars = new StringBuffer(); // used for contracting chars
|
||||
String inputLine = "";
|
||||
while (true) { // try {
|
||||
while (true) try {
|
||||
inputLine = in.readLine();
|
||||
if (inputLine == null) break; // means file is done
|
||||
String line = cleanLine(inputLine); // remove comments, extra whitespace
|
||||
|
@ -1326,14 +1327,17 @@ final public class UCA implements Comparator {
|
|||
|
||||
// collect characters
|
||||
char value = getChar(line, position);
|
||||
fixSurrogateContraction(value);
|
||||
char value2 = getChar(line, position);
|
||||
multiChars.setLength(0); // clear buffer
|
||||
if (value2 != NOT_A_CHAR) {
|
||||
fixSurrogateContraction(value2);
|
||||
multiChars.append(value); // append until we get terminator
|
||||
multiChars.append(value2);
|
||||
while (true) {
|
||||
value2 = getChar(line, position);
|
||||
if (value2 == NOT_A_CHAR) break;
|
||||
fixSurrogateContraction(value2);
|
||||
multiChars.append(value2);
|
||||
}
|
||||
}
|
||||
|
@ -1410,9 +1414,21 @@ final public class UCA implements Comparator {
|
|||
//} catch (Exception e) {
|
||||
// throw new IllegalArgumentException("Malformed line: " + inputLine + "\n "
|
||||
// + e.getClass().getName() + ": " + e.getMessage());
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("Error on line: " + inputLine);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private void fixSurrogateContraction(char ch) {
|
||||
//if (DEBUGCHAR) System.out.println(Utility.hex(ch) + ": " + line.substring(0, position[0]) + "|" + line.substring(position[0]));
|
||||
if (ch == NOT_A_CHAR || !UTF16.isLeadSurrogate(ch)) return;
|
||||
String chs = String.valueOf(ch);
|
||||
Object probe = contractingTable.get(chs);
|
||||
if (probe != null) return;
|
||||
contractingTable.put(chs, new Integer(0));
|
||||
}
|
||||
|
||||
private void concat(int[] ces1, int[] ces2) {
|
||||
|
||||
}
|
||||
|
@ -1479,6 +1495,7 @@ final public class UCA implements Comparator {
|
|||
Enumeration enum = contractingTable.keys();
|
||||
while (enum.hasMoreElements()) {
|
||||
String sequence = (String)enum.nextElement();
|
||||
//System.out.println("Contraction: " + Utility.hex(sequence));
|
||||
for (int i = sequence.length()-1; i > 0; --i) {
|
||||
String shorter = sequence.substring(0,i);
|
||||
Object probe = contractingTable.get(shorter);
|
||||
|
@ -1550,9 +1567,18 @@ final public class UCA implements Comparator {
|
|||
* On output, updated to point to the next place to search.
|
||||
*@return the character, or NOT_A_CHAR when done
|
||||
*/
|
||||
|
||||
// NOTE in case of surrogates, we buffer up the second character!!
|
||||
char charBuffer = 0;
|
||||
|
||||
private char getChar(String line, int[] position) {
|
||||
int start = position[0];
|
||||
char ch;
|
||||
if (charBuffer != 0) {
|
||||
ch = charBuffer;
|
||||
charBuffer = 0;
|
||||
return ch;
|
||||
}
|
||||
int start = position[0];
|
||||
while (true) { // trim whitespace
|
||||
if (start >= line.length()) return NOT_A_CHAR;
|
||||
ch = line.charAt(start);
|
||||
|
@ -1560,13 +1586,25 @@ final public class UCA implements Comparator {
|
|||
start++;
|
||||
}
|
||||
// from above, we have at least one char
|
||||
if ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F')) {
|
||||
position[0] = start + 4;
|
||||
return (char)Integer.parseInt(line.substring(start,start+4),16);
|
||||
int hexLimit = start;
|
||||
while ((ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F')) {
|
||||
hexLimit++;
|
||||
ch = line.charAt(hexLimit);
|
||||
}
|
||||
if (hexLimit >= start + 4) {
|
||||
position[0] = hexLimit;
|
||||
int cp = Integer.parseInt(line.substring(start,hexLimit),16);
|
||||
if (cp <= 0xFFFF) return (char)cp;
|
||||
//DEBUGCHAR = true;
|
||||
charBuffer = UTF16.getTrailSurrogate(cp);
|
||||
return UTF16.getLeadSurrogate(cp);
|
||||
}
|
||||
|
||||
return NOT_A_CHAR;
|
||||
}
|
||||
|
||||
boolean DEBUGCHAR = false;
|
||||
|
||||
BitSet primarySet = new BitSet();
|
||||
BitSet secondarySet = new BitSet();
|
||||
BitSet tertiarySet = new BitSet();
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/DerivedProperty.java,v $
|
||||
* $Date: 2001/10/26 23:33:08 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -22,24 +22,28 @@ public class DerivedProperty implements UCD_Types {
|
|||
|
||||
// ADD CONSTANT to UCD_TYPES
|
||||
|
||||
static public UnicodeProperty getProperty(int derivedPropertyID, UCD ucd) {
|
||||
return new DerivedProperty(ucd).dprops[derivedPropertyID];
|
||||
}
|
||||
|
||||
public DerivedProperty(UCD ucd) {
|
||||
ucdData = ucd;
|
||||
}
|
||||
|
||||
public String getHeader(int propNumber) {
|
||||
DProp dp = dprops[propNumber];
|
||||
UnicodeProperty dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getHeader();
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
public String getName(int propNumber, byte style) {
|
||||
DProp dp = dprops[propNumber];
|
||||
UnicodeProperty dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getName(style);
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
||||
public String getProperty(int cp, int propNumber) {
|
||||
DProp dp = dprops[propNumber];
|
||||
UnicodeProperty dp = dprops[propNumber];
|
||||
if (dp != null) return dp.getProperty(cp);
|
||||
else return "Unimplemented!!";
|
||||
}
|
||||
|
@ -67,16 +71,17 @@ public class DerivedProperty implements UCD_Types {
|
|||
return dprops[propNumber].getProperty(int cp);
|
||||
}
|
||||
*/
|
||||
private DProp[] dprops = new DProp[50];
|
||||
private UnicodeProperty[] dprops = new UnicodeProperty[50];
|
||||
private Normalizer[] nf = new Normalizer[4];
|
||||
private Normalizer nfd, nfc, nfkd, nfkc;
|
||||
static final String[] NAME = {"NFD", "NFC", "NFKD", "NFKC"};
|
||||
|
||||
static final String[] CaseNames = {
|
||||
"Uppercase",
|
||||
"Lowercase",
|
||||
"Mixedcase"};
|
||||
|
||||
private abstract class DProp {
|
||||
|
||||
/*
|
||||
private abstract static class UnicodeProperty {
|
||||
boolean testStatus = false;
|
||||
byte defaultStyle = LONG;
|
||||
String name, shortName, header;
|
||||
|
@ -90,13 +95,14 @@ public class DerivedProperty implements UCD_Types {
|
|||
public boolean propertyVaries() { return false; }
|
||||
public String getProperty(int cp) { return hasProperty(cp) ? name : ""; }
|
||||
}
|
||||
*/
|
||||
|
||||
class ExDProp extends DProp {
|
||||
class ExDProp extends UnicodeProperty {
|
||||
Normalizer nfx;
|
||||
ExDProp(int i) {
|
||||
nfx = nf[i-ExpandsOnNFD];
|
||||
name = "Expands_On_" + NAME[i-ExpandsOnNFD];
|
||||
shortName = "XO_" + NAME[i-ExpandsOnNFD];
|
||||
nfx = nf[i];
|
||||
name = "Expands_On_" + nfx.getName();
|
||||
shortName = "XO_" + nfx.getName();
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters whose normalized length is not one."
|
||||
|
@ -111,16 +117,15 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
class NF_UnsafeStartProp extends DProp {
|
||||
class NF_UnsafeStartProp extends UnicodeProperty {
|
||||
Normalizer nfx;
|
||||
int prop;
|
||||
//int prop;
|
||||
|
||||
NF_UnsafeStartProp(int i) {
|
||||
testStatus = true;
|
||||
prop = i-NFD_UnsafeStart;
|
||||
nfx = nf[prop];
|
||||
name = NAME[prop] + "_UnsafeStart";
|
||||
shortName = NAME[prop] + "_SS";
|
||||
isStandard = false;
|
||||
nfx = nf[i];
|
||||
name = nfx.getName() + "_UnsafeStart";
|
||||
shortName = nfx.getName() + "_SS";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated according to UAX #15."
|
||||
+ "\r\n# Characters that are cc==0, BUT which may interact with previous characters."
|
||||
|
@ -131,20 +136,20 @@ public class DerivedProperty implements UCD_Types {
|
|||
String norm = nfx.normalize(cp);
|
||||
int first = UTF16.charAt(norm, 0);
|
||||
if (ucdData.getCombiningClass(first) != 0) return true;
|
||||
if ((prop == 1 || prop == 3)
|
||||
if (nfx.isComposition()
|
||||
&& dprops[NFC_TrailingZero].hasProperty(first)) return true; // 1,3 == composing
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class NFC_Prop extends DProp {
|
||||
class NFC_Prop extends UnicodeProperty {
|
||||
BitSet bitset;
|
||||
boolean filter = false;
|
||||
boolean keepNonZero = true;
|
||||
|
||||
NFC_Prop(int i) {
|
||||
testStatus = true;
|
||||
isStandard = false;
|
||||
BitSet[] bitsets = new BitSet[3];
|
||||
switch(i) {
|
||||
case NFC_Leading: bitsets[0] = bitset = new BitSet(); break;
|
||||
|
@ -181,27 +186,27 @@ public class DerivedProperty implements UCD_Types {
|
|||
};
|
||||
};
|
||||
|
||||
class GenDProp extends DProp {
|
||||
class GenDProp extends UnicodeProperty {
|
||||
Normalizer nfx;
|
||||
Normalizer nfComp = null;
|
||||
|
||||
GenDProp (int i) {
|
||||
testStatus = true;
|
||||
nfx = nf[i-GenNFD];
|
||||
name = NAME[i-GenNFD];
|
||||
isStandard = false;
|
||||
nfx = nf[i];
|
||||
name = nfx.getName();
|
||||
String compName = "the character itself";
|
||||
|
||||
if (i == GenNFKC || i == GenNFD) {
|
||||
if (i == NFKC || i == NFD) {
|
||||
name += "-NFC";
|
||||
nfComp = nfc;
|
||||
compName = "NFC for the character";
|
||||
} else if (i == GenNFKD) {
|
||||
} else if (i == NFKD) {
|
||||
name += "-NFD";
|
||||
nfComp = nfd;
|
||||
compName = "NFD for the character";
|
||||
}
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Lists characters in normalized form " + NAME[i-GenNFD] + "."
|
||||
+ "\r\n# Lists characters in normalized form " + nfx.getName() + "."
|
||||
+ "\r\n# Only those characters whith normalized forms are DIFFERENT from " + compName + " are listed!"
|
||||
+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
|
||||
+ "\r\n# It is NOT sufficient to replace characters one-by-one with these results!";
|
||||
|
@ -237,10 +242,10 @@ public class DerivedProperty implements UCD_Types {
|
|||
boolean hasProperty(int cp) { return getProperty(cp).length() != 0; }
|
||||
};
|
||||
|
||||
class CaseDProp extends DProp {
|
||||
class CaseDProp extends UnicodeProperty {
|
||||
byte val;
|
||||
CaseDProp (int i) {
|
||||
testStatus = true;
|
||||
isStandard = false;
|
||||
val = (i == Missing_Uppercase ? Lu : i == Missing_Lowercase ? Ll : Lt);
|
||||
name = "Possible_Missing_" + CaseNames[i-Missing_Uppercase];
|
||||
header = "# Derived Property: " + name
|
||||
|
@ -256,16 +261,16 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
class QuickDProp extends DProp {
|
||||
class QuickDProp extends UnicodeProperty {
|
||||
String NO;
|
||||
String MAYBE;
|
||||
Normalizer nfx;
|
||||
QuickDProp (int i) {
|
||||
nfx = nf[i - QuickNFD];
|
||||
NO = NAME[i-QuickNFD] + "_NO";
|
||||
MAYBE = NAME[i-QuickNFD] + "_MAYBE";
|
||||
name = NAME[i-QuickNFD] + "_QuickCheck";
|
||||
shortName = NAME[i-QuickNFD] + "_QC";
|
||||
nfx = nf[i];
|
||||
NO = nfx.getName() + "_NO";
|
||||
MAYBE = nfx.getName() + "_MAYBE";
|
||||
name = nfx.getName() + "_QuickCheck";
|
||||
shortName = nfx.getName() + "_QC";
|
||||
header = "# Derived Property: " + name
|
||||
+ "\r\n# Generated from computing decomposibles"
|
||||
+ ((i == QuickNFC || i == QuickNFKC)
|
||||
|
@ -288,11 +293,11 @@ public class DerivedProperty implements UCD_Types {
|
|||
nfkc = nf[3] = new Normalizer(Normalizer.NFKC);
|
||||
|
||||
for (int i = ExpandsOnNFD; i <= ExpandsOnNFKC; ++i) {
|
||||
dprops[i] = new ExDProp(i);
|
||||
dprops[i] = new ExDProp(i-ExpandsOnNFD);
|
||||
}
|
||||
|
||||
for (int i = GenNFD; i <= GenNFKC; ++i) {
|
||||
dprops[i] = new GenDProp(i);
|
||||
dprops[i] = new GenDProp(i-GenNFD);
|
||||
}
|
||||
|
||||
for (int i = NFC_Leading; i <= NFC_Resulting; ++i) {
|
||||
|
@ -300,10 +305,10 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
|
||||
for (int i = NFD_UnsafeStart; i <= NFKC_UnsafeStart; ++i) {
|
||||
dprops[i] = new NF_UnsafeStartProp(i);
|
||||
dprops[i] = new NF_UnsafeStartProp(i-NFD_UnsafeStart);
|
||||
}
|
||||
|
||||
dprops[ID_Start] = new DProp() {
|
||||
dprops[ID_Start] = new UnicodeProperty() {
|
||||
{
|
||||
name = "ID_Start";
|
||||
shortName = "IDS";
|
||||
|
@ -316,7 +321,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[ID_Continue_NO_Cf] = new DProp() {
|
||||
dprops[ID_Continue_NO_Cf] = new UnicodeProperty() {
|
||||
{
|
||||
name = "ID_Continue";
|
||||
shortName = "IDC";
|
||||
|
@ -330,7 +335,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[Mod_ID_Start] = new DProp() {
|
||||
dprops[Mod_ID_Start] = new UnicodeProperty() {
|
||||
{
|
||||
name = "XID_Start";
|
||||
shortName = "XIDS";
|
||||
|
@ -345,7 +350,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[Mod_ID_Continue_NO_Cf] = new DProp() {
|
||||
dprops[Mod_ID_Continue_NO_Cf] = new UnicodeProperty() {
|
||||
{
|
||||
name = "XID_Continue";
|
||||
shortName = "XIDC";
|
||||
|
@ -361,7 +366,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[PropMath] = new DProp() {
|
||||
dprops[PropMath] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Math";
|
||||
shortName = name;
|
||||
|
@ -376,7 +381,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[PropAlphabetic] = new DProp() {
|
||||
dprops[PropAlphabetic] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Alphabetic";
|
||||
shortName = "Alpha";
|
||||
|
@ -391,7 +396,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[PropLowercase] = new DProp() {
|
||||
dprops[PropLowercase] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Lowercase";
|
||||
shortName = "Lower";
|
||||
|
@ -406,7 +411,7 @@ public class DerivedProperty implements UCD_Types {
|
|||
}
|
||||
};
|
||||
|
||||
dprops[PropUppercase] = new DProp() {
|
||||
dprops[PropUppercase] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Uppercase";
|
||||
shortName = "Upper";
|
||||
|
@ -432,7 +437,7 @@ including all characters whose canonical decomposition consists of a single char
|
|||
file by including all characters whose canonical decomposition consists of a sequence
|
||||
of characters, the first of which has a non-zero combining class.
|
||||
*/
|
||||
dprops[FullCompExclusion] = new DProp() {
|
||||
dprops[FullCompExclusion] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Full_Composition_Exclusion";
|
||||
shortName = "Comp_Ex";
|
||||
|
@ -451,9 +456,9 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[FullCompInclusion] = new DProp() {
|
||||
dprops[FullCompInclusion] = new UnicodeProperty() {
|
||||
{
|
||||
testStatus = true;
|
||||
isStandard = false;
|
||||
name = "Full_Composition_Inclusion";
|
||||
shortName = "Comp_In";
|
||||
defaultStyle = SHORT;
|
||||
|
@ -471,7 +476,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[FC_NFKC_Closure] = new DProp() {
|
||||
dprops[FC_NFKC_Closure] = new UnicodeProperty() {
|
||||
{
|
||||
name = "FC_NFKC_Closure";
|
||||
shortName = "FC_NFKC";
|
||||
|
@ -491,7 +496,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
boolean hasProperty(int cp) { return getProperty(cp).length() != 0; }
|
||||
};
|
||||
|
||||
dprops[FC_NFC_Closure] = new DProp() {
|
||||
dprops[FC_NFC_Closure] = new UnicodeProperty() {
|
||||
{
|
||||
name = "FC_NFC_Closure";
|
||||
shortName = "FC_NFC";
|
||||
|
@ -512,10 +517,10 @@ of characters, the first of which has a non-zero combining class.
|
|||
};
|
||||
|
||||
for (int i = QuickNFD; i <= QuickNFKC; ++i) {
|
||||
dprops[i] = new QuickDProp(i);
|
||||
dprops[i] = new QuickDProp(i - QuickNFD);
|
||||
}
|
||||
|
||||
dprops[DefaultIgnorable] = new DProp() {
|
||||
dprops[DefaultIgnorable] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Default_Ignorable_Code_Point";
|
||||
shortName = "DI";
|
||||
|
@ -538,7 +543,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
# GraphemeBase :=
|
||||
|
||||
*/
|
||||
dprops[GraphemeExtend] = new DProp() {
|
||||
dprops[GraphemeExtend] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Grapheme_Extend";
|
||||
shortName = "GrExt";
|
||||
|
@ -556,7 +561,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[Other_Case_Ignorable] = new DProp() {
|
||||
dprops[Other_Case_Ignorable] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Other_Case_Ignorable";
|
||||
shortName = "OCI";
|
||||
|
@ -577,7 +582,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[Type_i] = new DProp() {
|
||||
dprops[Type_i] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Special_Dotted";
|
||||
shortName = "SDot";
|
||||
|
@ -606,7 +611,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[Case_Ignorable] = new DProp() {
|
||||
dprops[Case_Ignorable] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Case_Ignorable";
|
||||
shortName = "CI";
|
||||
|
@ -621,7 +626,7 @@ of characters, the first of which has a non-zero combining class.
|
|||
}
|
||||
};
|
||||
|
||||
dprops[GraphemeBase] = new DProp() {
|
||||
dprops[GraphemeBase] = new UnicodeProperty() {
|
||||
{
|
||||
name = "Grapheme_Base";
|
||||
shortName = "GrBase";
|
||||
|
@ -648,6 +653,9 @@ of characters, the first of which has a non-zero combining class.
|
|||
if (cat == Ll
|
||||
|| ucdData.getBinaryProperty(cp, Other_Lowercase)) return Ll;
|
||||
if (cat == Lt || cat == Lo || cat == Lm || cat == Nl) return cat;
|
||||
|
||||
if (true) throw new IllegalArgumentException("FIX nf[2]");
|
||||
|
||||
if (!nf[2].normalizationDiffers(cp)) return Lo;
|
||||
|
||||
String norm = nf[2].normalize(cp);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Main.java,v $
|
||||
* $Date: 2001/11/13 02:31:55 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -35,6 +35,7 @@ public final class Main {
|
|||
} else if (arg.equalsIgnoreCase("build")) {
|
||||
ConvertUCD.main(new String[]{ucdVersion});
|
||||
} else if (arg.equalsIgnoreCase("version")) ucdVersion = args[++i];
|
||||
else if (arg.equalsIgnoreCase("testskippable")) NFSkippable.main(null);
|
||||
else if (arg.equalsIgnoreCase("generateXML")) VerifyUCD.generateXML();
|
||||
else if (arg.equalsIgnoreCase("checkSpeed")) VerifyUCD.checkSpeed();
|
||||
else if (arg.equalsIgnoreCase("generateHanTransliterator")) GenerateHanTransliterator.main();
|
||||
|
|
284
tools/unicodetools/com/ibm/text/UCD/NFSkippable.java
Normal file
284
tools/unicodetools/com/ibm/text/UCD/NFSkippable.java
Normal file
|
@ -0,0 +1,284 @@
|
|||
package com.ibm.text.UCD;
|
||||
import com.ibm.text.UTF16;
|
||||
import com.ibm.text.UnicodeSet;
|
||||
import java.util.BitSet;
|
||||
import com.ibm.text.utility.*;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
|
||||
public final class NFSkippable extends UnicodeProperty {
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
|
||||
private Normalizer nf;
|
||||
private Normalizer nfd;
|
||||
private boolean composes;
|
||||
private int[] realTrailers = new int[100];
|
||||
private int realTrailerCount = 0;
|
||||
|
||||
/**
 * Creates the skippable property for one normalization form.
 * Sets the property name, short name and header, builds the normalizers,
 * and, for composing forms (NFC, NFKC), collects every code point that the
 * normalizer reports as trailing, except trailing Jamo, into
 * {@code realTrailers} for later use by {@code hasProperty}.
 *
 * @param normalizerMode one of the {@code Normalizer} form constants
 * @param unicodeVersion Unicode version string passed to {@code UCD.make}
 *                       and to the {@code Normalizer} constructor
 */
public NFSkippable(byte normalizerMode, String unicodeVersion) {
    isStandard = false;
    ucd = UCD.make(unicodeVersion);
    nf = new Normalizer(normalizerMode, unicodeVersion);
    name = nf.getName() + "_Skippable";
    shortName = nf.getName() + "_Skip";
    header = "# Derived Property: " + name
        + "\r\n# Generated according to UAX #15."
        + "\r\n# Characters that don't interact with any others in this normalization form."
        + "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
        + "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";

    nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
    composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;

    // preprocess to find possible trailers

    if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
        if (!nf.isTrailing(cp2)) continue;
        //System.out.println("Trailing: " + ucd.getCodeAndName(cp2));
        if (ucd.isTrailingJamo(cp2)) {
            //System.out.println("Jamo: " + ucd.getCodeAndName(cp2));
            continue;
        }
        // The trailer list starts with a fixed capacity; grow it rather than
        // overflow when a Unicode version has more trailers than expected.
        if (realTrailerCount == realTrailers.length) {
            int[] grown = new int[realTrailers.length * 2];
            System.arraycopy(realTrailers, 0, grown, 0, realTrailerCount);
            realTrailers = grown;
        }
        realTrailers[realTrailerCount++] = cp2;
    }
    Utility.fixDot();
    //System.out.println("trailer count: " + realTrailerCount);
}
|
||||
|
||||
/** A skippable character is<br>
|
||||
* a) unassigned, or ALL of the following:<br>
|
||||
* b) of combining class 0.<br>
|
||||
* c) not decomposed by this normalization form.<br>
|
||||
* AND if NFC or NFKC, <br>
|
||||
* d) can never compose with a previous character.<br>
|
||||
* e) can never compose with a following character.<br>
|
||||
* f) can never change if another character is added.
|
||||
* Example: a-breve might satisfy all but f, but if you
|
||||
* add an ogonek it changes to a-ogonek + breve
|
||||
*/
|
||||
|
||||
String cause = "";
|
||||
|
||||
public boolean hasProperty(int cp) {
|
||||
// quick check on some special classes
|
||||
if (DEBUG) cause = "\t\tunassigned";
|
||||
if (!ucd.isAssigned(cp)) return true;
|
||||
|
||||
if (DEBUG) cause = "\t\tnf differs";
|
||||
if (nf.normalizationDiffers(cp)) return false;
|
||||
|
||||
if (DEBUG) cause = "\t\tnon-zero cc";
|
||||
if (ucd.getCombiningClass(cp) != 0) return false;
|
||||
|
||||
if (DEBUG) cause = "";
|
||||
if (!composes) return true;
|
||||
|
||||
// now special checks for composing normalizers
|
||||
if (DEBUG) cause = "\t\tleading";
|
||||
if (nf.isLeading(cp)) return false;
|
||||
|
||||
if (DEBUG) cause = "\t\ttrailing";
|
||||
if (nf.isTrailing(cp)) return false;
|
||||
|
||||
// OPTIMIZATION -- careful
|
||||
// If there is no NFD decomposition, then this character's accents can't be
|
||||
// "displaced", so we don't have to test further
|
||||
|
||||
if (DEBUG) cause = "\t\tno decomp";
|
||||
if (!nfd.normalizationDiffers(cp)) return true;
|
||||
|
||||
// OPTIMIZATION -- careful
|
||||
// Hangul syllables are skippable IFF they are isLeadingJamoComposition
|
||||
if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp);
|
||||
|
||||
// We now see if adding another character causes a problem.
|
||||
// brute force for now!!
|
||||
// We do skip the trailing Jamo, since those never displace!
|
||||
|
||||
StringBuffer base = new StringBuffer(UTF16.valueOf(cp));
|
||||
int baseLen = base.length();
|
||||
for (int i = 0; i < realTrailerCount; ++i) {
|
||||
base.setLength(baseLen); // shorten if needed
|
||||
base.append(UTF16.valueOf(realTrailers[i]));
|
||||
String probe = base.toString();
|
||||
String result = nf.normalize(probe);
|
||||
if (!result.equals(probe)) {
|
||||
if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// passed the sieve, so we are ok
|
||||
if (DEBUG) cause = "";
|
||||
return true;
|
||||
}
|
||||
|
||||
// both the following should go into UTF16
|
||||
|
||||
public static String replace(String source, int toReplace, int replacement) {
|
||||
if (0 <= toReplace && toReplace <= 0xFFFF
|
||||
&& 0 <= replacement && replacement <= 0xFFFF) {
|
||||
return source.replace((char)toReplace, (char)replacement);
|
||||
}
|
||||
return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement));
|
||||
}
|
||||
|
||||
public static String replace(String source, String toReplace, String replacement) {
|
||||
int pos = 0;
|
||||
StringBuffer result = new StringBuffer(source.length());
|
||||
while (true) {
|
||||
int newPos = source.indexOf(toReplace, pos);
|
||||
if (newPos >= 0) {
|
||||
result.append(source.substring(pos, newPos));
|
||||
result.append(replacement);
|
||||
pos = newPos + toReplace.length();
|
||||
} else if (pos != 0) {
|
||||
result.append(source.substring(pos));
|
||||
return result.toString();
|
||||
} else {
|
||||
return source; // no change necessary
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void writeStringInPieces(PrintWriter pw, String s, String term) {
|
||||
int start;
|
||||
int end;
|
||||
int lineLen = 64;
|
||||
for (start = 0; ; start = end) {
|
||||
if (start == 0) pw.print("\t \"");
|
||||
else pw.print("\t+ \"");
|
||||
end = s.length();
|
||||
if (end > start + lineLen) end = start + lineLen;
|
||||
|
||||
// if we have a slash in the last 5 characters, backup
|
||||
|
||||
int lastSlash = s.lastIndexOf('\\', end);
|
||||
if (lastSlash >= end-5) end = lastSlash;
|
||||
|
||||
// backup if we broke on a \
|
||||
|
||||
while (end > start && s.charAt(end-1) == '\\') --end;
|
||||
|
||||
pw.print(s.substring(start, end));
|
||||
if (end == s.length()) {
|
||||
pw.println('"' + term);
|
||||
break;
|
||||
} else {
|
||||
pw.println('"');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void testWriteStringInPieces() {
|
||||
String test =
|
||||
"[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
|
||||
+ "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F"
|
||||
+ "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-"
|
||||
+ "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017"
|
||||
+ "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u"
|
||||
+ "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226";
|
||||
PrintWriter pw = new PrintWriter(System.out);
|
||||
writeStringInPieces(pw,test,"");
|
||||
writeStringInPieces(pw,replace(test, "\\", "\\\\"),"");
|
||||
|
||||
pw.flush();
|
||||
}
|
||||
|
||||
static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller
|
||||
|
||||
public static void main (String[] args) throws java.io.IOException {
|
||||
|
||||
String version = ""; // Unicode version, "" = latest released
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt");
|
||||
|
||||
for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
|
||||
UnicodeProperty up = DerivedProperty.getProperty(mode, UCD.make(version));
|
||||
generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
|
||||
}
|
||||
|
||||
for (byte mode = NFD; mode <= NFKC; ++mode) {
|
||||
NFSkippable skipper = new NFSkippable(mode,version);
|
||||
generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
|
||||
}
|
||||
|
||||
out.close();
|
||||
}
|
||||
|
||||
static void generateSet(PrintWriter out, String label, UnicodeProperty up) {
|
||||
System.out.println("Generating: " + up.getName(NORMAL));
|
||||
UnicodeSet result = new UnicodeSet();
|
||||
for (int cp = 0; cp <= limit; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (up.hasProperty(cp)) result.add(cp);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
String rSet = result.toPattern(true);
|
||||
rSet = replace(rSet, "\\U", "\\\\U");
|
||||
out.println(label + " = new UnicodeSet(");
|
||||
writeStringInPieces(out, rSet, ", false);");
|
||||
out.println();
|
||||
|
||||
rSet = result.toPattern(false);
|
||||
out.println("/*Unicode: ");
|
||||
writeStringInPieces(out, rSet, "*/");
|
||||
out.println();
|
||||
out.flush();
|
||||
}
|
||||
|
||||
/*
|
||||
// DerivedProperty dp = new DerivedProperty(UCD.make(version));
|
||||
|
||||
System.out.println(skipper.getName(NORMAL));
|
||||
|
||||
UnicodeSet result = new UnicodeSet();
|
||||
for (int cp = 0; cp <= limit; ++cp) {
|
||||
Utility.dot(cp);
|
||||
if (skipper.hasProperty(cp)) result.add(cp);
|
||||
}
|
||||
Utility.fixDot();
|
||||
|
||||
String rSet = result.toPattern(true);
|
||||
rSet = replace(rSet, "\\U", "\\\\U");
|
||||
out.println("\tSKIPPABLE[" + skipper.getName(NORMAL)
|
||||
+ "] = new UnicodeSet(");
|
||||
writeStringInPieces(out, rSet, ", false);");
|
||||
out.println();
|
||||
|
||||
rSet = result.toPattern(false);
|
||||
out.println("/*Unicode: ");
|
||||
*/
|
||||
//writeStringInPieces(out, rSet, "*/");
|
||||
/*out.println();
|
||||
out.flush();
|
||||
|
||||
if (false) {
|
||||
NFSkippable skipper = new NFSkippable(Normalizer.NFC,"");
|
||||
NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,"");
|
||||
for (int cp = 0; cp <= 0x10FFFF; ++cp) {
|
||||
if (cp > 0xFF) {
|
||||
if (!skipper.ucd.isAssigned(cp)) continue;
|
||||
byte cat = skipper.ucd.getCategory(cp);
|
||||
if (cat == PRIVATE_USE || cat == SURROGATE) continue;
|
||||
if (skipper.ucd.getCombiningClass(cp) != 0) continue;
|
||||
if (skipper.nf.normalizationDiffers(cp)) continue;
|
||||
if ((cp < 0xAC00 || cp > 0xAE00)
|
||||
&& cp != skipper.ucd.mapToRepresentative(cp, false)) continue;
|
||||
}
|
||||
|
||||
if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue;
|
||||
|
||||
String status = (skipper.hasProperty(cp) ? " SKIPc " : "NOSKIPc ")
|
||||
+ (skipper2.hasProperty(cp) ? " SKIPkc " : "NOSKIPkc ");
|
||||
System.out.println(status
|
||||
+ skipper.ucd.getCodeAndName(cp)
|
||||
+ skipper.cause);
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/Normalizer.java,v $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -40,8 +40,9 @@ public final class Normalizer implements UCD_Types {
|
|||
* Create a normalizer for a given form.
|
||||
*/
|
||||
public Normalizer(byte form, String unicodeVersion) {
|
||||
this.composition = (form & COMPOSITION_MASK) != 0;
|
||||
this.compatibility = (form & COMPATIBILITY_MASK) != 0;
|
||||
this.form = form;
|
||||
this.composition = (form & NF_COMPOSITION_MASK) != 0;
|
||||
this.compatibility = (form & NF_COMPATIBILITY_MASK) != 0;
|
||||
this.data = getData(unicodeVersion);
|
||||
}
|
||||
|
||||
|
@ -53,20 +54,32 @@ public final class Normalizer implements UCD_Types {
|
|||
}
|
||||
|
||||
/**
|
||||
* Masks for the form selector
|
||||
*/
|
||||
public static final byte
|
||||
COMPATIBILITY_MASK = 1,
|
||||
COMPOSITION_MASK = 2;
|
||||
* Return string name
|
||||
*/
|
||||
public static String getName(byte form) {
|
||||
return UCD_Names.NF_NAME[form];
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalization Form Selector
|
||||
*/
|
||||
public static final byte
|
||||
NFD = 0 ,
|
||||
NFKD = COMPATIBILITY_MASK,
|
||||
NFC = COMPOSITION_MASK,
|
||||
NFKC = (byte)(COMPATIBILITY_MASK + COMPOSITION_MASK);
|
||||
* Return string name
|
||||
*/
|
||||
public String getName() {
|
||||
return getName(form);
|
||||
}
|
||||
|
||||
/**
|
||||
* Does compose?
|
||||
*/
|
||||
public boolean isComposition() {
|
||||
return composition;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does compose?
|
||||
*/
|
||||
public boolean isCompatibility() {
|
||||
return compatibility;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalizes text according to the chosen form,
|
||||
|
@ -234,6 +247,10 @@ public final class Normalizer implements UCD_Types {
|
|||
return this.composition ? data.isTrailing(cp) : false;
|
||||
}
|
||||
|
||||
public boolean isLeading(int cp) {
|
||||
return this.composition ? data.isLeading(cp) : false;
|
||||
}
|
||||
|
||||
|
||||
// ======================================
|
||||
// PRIVATES
|
||||
|
@ -242,13 +259,14 @@ public final class Normalizer implements UCD_Types {
|
|||
/**
|
||||
* The current form.
|
||||
*/
|
||||
private byte form;
|
||||
private boolean composition;
|
||||
private boolean compatibility;
|
||||
|
||||
/**
|
||||
* Decomposes text, either canonical or compatibility,
|
||||
* replacing contents of the target buffer.
|
||||
* @param form the normalization form. If COMPATIBILITY_MASK
|
||||
* @param form the normalization form. If NF_COMPATIBILITY_MASK
|
||||
* bit is on in this byte, then selects the recursive
|
||||
* compatibility decomposition, otherwise selects
|
||||
* the recursive canonical decomposition.
|
||||
|
@ -342,6 +360,7 @@ public final class Normalizer implements UCD_Types {
|
|||
private UCD ucd;
|
||||
private HashMap compTable = new HashMap();
|
||||
private BitSet isSecond = new BitSet();
|
||||
private BitSet isFirst = new BitSet();
|
||||
private BitSet canonicalRecompose = new BitSet();
|
||||
private BitSet compatibilityRecompose = new BitSet();
|
||||
static final int NOT_COMPOSITE = 0xFFFF;
|
||||
|
@ -352,6 +371,7 @@ public final class Normalizer implements UCD_Types {
|
|||
if (!ucd.isAssigned(i)) continue;
|
||||
if (ucd.isPUA(i)) continue;
|
||||
if (ucd.isTrailingJamo(i)) isSecond.set(i);
|
||||
if (ucd.isLeadingJamoComposition(i)) isFirst.set(i);
|
||||
byte dt = ucd.getDecompositionType(i);
|
||||
if (dt != CANONICAL) continue;
|
||||
if (!ucd.getBinaryProperty(i, CompositionExclusion)) {
|
||||
|
@ -364,6 +384,7 @@ public final class Normalizer implements UCD_Types {
|
|||
}
|
||||
int a = UTF16.charAt(s, 0);
|
||||
if (ucd.getCombiningClass(a) != 0) continue;
|
||||
isFirst.set(a);
|
||||
|
||||
int b = UTF16.charAt(s, UTF16.getCharCount(a));
|
||||
isSecond.set(b);
|
||||
|
@ -429,6 +450,10 @@ Problem: differs: true, call: false U+1FED GREEK DIALYTIKA AND VARIA
|
|||
return isSecond.get(cp);
|
||||
}
|
||||
|
||||
boolean isLeading(int cp) {
|
||||
return isFirst.get(cp);
|
||||
}
|
||||
|
||||
boolean normalizationDiffers(int cp, boolean composition, boolean compatibility) {
|
||||
byte dt = ucd.getDecompositionType(cp);
|
||||
if (!composition) {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/PropertyLister.java,v $
|
||||
* $Date: 2001/09/19 23:33:16 $
|
||||
* $Revision: 1.3 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.4 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -15,6 +15,7 @@ package com.ibm.text.UCD;
|
|||
|
||||
import java.io.*;
|
||||
import com.ibm.text.utility.*;
|
||||
import com.ibm.text.UnicodeSet;
|
||||
import java.text.NumberFormat;
|
||||
|
||||
|
||||
|
@ -32,6 +33,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
protected int firstRealCp = -2;
|
||||
protected int lastRealCp = -2;
|
||||
protected boolean alwaysBreaks = false; // set to true if property only breaks
|
||||
private UnicodeSet set = new UnicodeSet();
|
||||
|
||||
public static final byte INCLUDE = 0, BREAK = 1, CONTINUE = 2, EXCLUDE = 3;
|
||||
|
||||
|
@ -65,6 +67,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
|
||||
public void format(int startCp, int endCp, int realCount) {
|
||||
try {
|
||||
set.add(startCp, endCp);
|
||||
String prop = propertyName(startCp);
|
||||
if (prop.length() > 0) prop = "; " + prop;
|
||||
String opt = optionalName(startCp);
|
||||
|
@ -153,6 +156,7 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
}
|
||||
|
||||
public int print() {
|
||||
set.clear();
|
||||
int count = 0;
|
||||
firstRealCp = -1;
|
||||
byte firstRealCpCat = -1;
|
||||
|
@ -215,6 +219,8 @@ abstract public class PropertyLister implements UCD_Types {
|
|||
output.println();
|
||||
output.println("# Total code points: " + nf.format(count));
|
||||
output.println();
|
||||
System.out.println(headerString());
|
||||
System.out.println(set.toPattern(true));
|
||||
return count;
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,50 @@
|
|||
# DRAFT
|
||||
# PropertyValueAliases-3.2.0.txt
|
||||
#
|
||||
# This file contains aliases for property values used in the UCD.
|
||||
# These names can be used for XML formats of UCD data, for regular-expression
|
||||
# property tests, and other programmatic textual descriptions of Unicode data.
|
||||
# The names are not normative, except where they correspond to normative property
|
||||
# values in the UCD. For information on which properties are normative, see
|
||||
# UnicodeCharacterDatabase.html.
|
||||
#
|
||||
# The names may be translated in appropriate environments, and additional
|
||||
# aliases may be useful.
|
||||
#
|
||||
# FORMAT
|
||||
#
|
||||
# Each line describes a property value name.
|
||||
# This consists of three fields, separated by semicolons.
|
||||
#
|
||||
# First Field: The first field describes the property for which that
|
||||
# property value name is used.
|
||||
# There is one special pseudo-property: "qc" stands for any quick-check property
|
||||
#
|
||||
# Second Field: The second field is an abbreviated name.
|
||||
# If there is no abbreviated name available, the field is marked with "n/a".
|
||||
#
|
||||
# Third Field: The third field is a long name.
|
||||
#
|
||||
# With loose matching of property names, the case distinctions, whitespace,
|
||||
# and '_' are ignored.
|
||||
#
|
||||
# NOTE: The Block property values are in Blocks.txt, and not repeated here.
|
||||
# For more information on the use of blocks, see UTR #24: Regular Expression Guidelines
|
||||
#
|
||||
# NOTE: Currently there is at most one abbreviated name and one long name for
|
||||
# each property value. However, in the future additional aliases
|
||||
# may be added. In such a case, the first line for the property value
|
||||
# would have the preferred alias for output.
|
||||
#
|
||||
# NOTE: The property value names are NOT unique across properties, especially
|
||||
# with loose matches. For example,
|
||||
# AL means Arabic Letter for the Bidi_Class property, and
|
||||
# AL means Alpha_Left for the Combining_Class property, and
|
||||
# AL means Alphabetic for the Line_Break property.
|
||||
#
|
||||
# In addition, some property names may be the same as some property value names:
|
||||
# cc means Combining_Class property, and
|
||||
# cc means the General_Category property value Control (cc)
|
||||
#
|
||||
# The combination of property value and property name is, however, unique.
|
||||
# For more information, see UTR #24: Regular Expression Guidelines
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2001/10/26 23:33:07 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -977,6 +977,16 @@ to guarantee identifier closure.
|
|||
}
|
||||
return 0xFFFF; // no composition
|
||||
}
|
||||
|
||||
static boolean isHangulSyllable(int char1) {
|
||||
return SBase <= char1 && char1 < SLimit;
|
||||
}
|
||||
|
||||
static boolean isLeadingJamoComposition(int char1) {
|
||||
return (LBase <= char1 && char1 < LLimit
|
||||
|| SBase <= char1 && char1 < SLimit
|
||||
&& ((char1 - SBase) % TCount) == 0);
|
||||
}
|
||||
|
||||
static boolean isTrailingJamo(int cp) {
|
||||
return (VBase <= cp && cp < VLimit) || (TBase <= cp && cp < TLimit);
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2001/11/13 02:31:55 $
|
||||
* $Revision: 1.7 $
|
||||
* $Date: 2001/12/03 19:29:35 $
|
||||
* $Revision: 1.8 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -766,6 +766,8 @@ final class UCD_Names implements UCD_Types {
|
|||
"P", // U+11C1; P; HANGUL JONGSEONG PHIEUPH
|
||||
"H", // U+11C2; H; HANGUL JONGSEONG HIEUH
|
||||
};
|
||||
|
||||
static final String[] NF_NAME = {"NFD", "NFC", "NFKD", "NFKC"};
|
||||
|
||||
/*
|
||||
static {
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2001/10/25 20:33:46 $
|
||||
* $Revision: 1.4 $
|
||||
* $Date: 2001/12/03 19:29:34 $
|
||||
* $Revision: 1.5 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -390,6 +390,11 @@ public static byte
|
|||
ZAIN = 49,
|
||||
LIMIT_JOINING_GROUP = 50;
|
||||
|
||||
static final byte NFD = 0, NFC = 1, NFKD = 2, NFKC = 3;
|
||||
public static final int
|
||||
NF_COMPATIBILITY_MASK = 2,
|
||||
NF_COMPOSITION_MASK = 1;
|
||||
|
||||
// DERIVED PROPERTY
|
||||
|
||||
static final int
|
||||
|
@ -448,6 +453,11 @@ public static byte
|
|||
NFKD_UnsafeStart = 39,
|
||||
NFKC_UnsafeStart = 40,
|
||||
|
||||
NFD_Skippable = 41,
|
||||
NFC_Skippable = 42,
|
||||
NFKD_Skippable = 43,
|
||||
NFKC_Skippable = 44,
|
||||
|
||||
LIMIT = 41;
|
||||
|
||||
}
|
44
tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java
Normal file
44
tools/unicodetools/com/ibm/text/UCD/UnicodeProperty.java
Normal file
|
@ -0,0 +1,44 @@
|
|||
package com.ibm.text.UCD;
|
||||
public abstract class UnicodeProperty implements UCD_Types {
|
||||
|
||||
protected UCD ucd;
|
||||
protected boolean isStandard = true;
|
||||
protected byte defaultStyle = LONG;
|
||||
protected String name, shortName, header;
|
||||
|
||||
// Old Names for compatibility
|
||||
boolean isTest() { return isStandard(); }
|
||||
|
||||
/**
|
||||
* Is it part of the standard, or just for my testing?
|
||||
*/
|
||||
public boolean isStandard() { return isStandard; }
|
||||
|
||||
/**
|
||||
* Get the property name. Style is SHORT, NORMAL, LONG
|
||||
*/
|
||||
public String getName(byte style) {
|
||||
if (style == NORMAL) style = defaultStyle;
|
||||
return style < LONG ? shortName : name;
|
||||
}
|
||||
|
||||
/** Header used in DerivedXXX files
|
||||
*/
|
||||
public String getHeader() { return header; }
|
||||
|
||||
/**
|
||||
* Does getProperty vary in contents?
|
||||
*/
|
||||
public boolean propertyVaries() { return false; }
|
||||
|
||||
/**
|
||||
* Get the property value as a string, or "" if hasProperty is false
|
||||
*/
|
||||
public String getProperty(int cp) { return hasProperty(cp) ? name : ""; }
|
||||
|
||||
/**
|
||||
* Does it have the propertyValue
|
||||
*/
|
||||
abstract boolean hasProperty(int cp);
|
||||
}
|
||||
|
Loading…
Add table
Reference in a new issue