Mirror of https://github.com/unicode-org/icu.git (synced 2025-04-05 21:45:37 +00:00)

Commit ac3cc9119b (parent 0e9c21e7e9): ICU-0 updates for uca 4.1.0
X-SVN-Rev: 17468

11 changed files with 440 additions and 150 deletions
@@ -46,7 +46,10 @@ public class BagFormatter {
         "'>' > '&gt;' ;";

     private static final String HTML_RULES = BASE_RULES + CONTENT_RULES +
-        "'\"' > '&quot;' ; ";
+        "'\"' > '&quot;' ; ";
+
+    private static final String HTML_RULES_CONTROLS = HTML_RULES +
+        "([[:C:][:Z:][:whitespace:][:Default_Ignorable_Code_Point:]]) > &hex/unicode($1) ; ";

     private static final String XML_RULES = HTML_RULES +
         "'' > '&apos;' ; ";
@@ -94,6 +97,8 @@ the double-quote character (") as "&quot;".
     public static final Transliterator toHTML = Transliterator.createFromRules(
         "any-html", HTML_RULES, Transliterator.FORWARD);
+    public static final Transliterator toHTMLControl = Transliterator.createFromRules(
+        "any-html", HTML_RULES_CONTROLS, Transliterator.FORWARD);
     public static final Transliterator fromHTML = Transliterator.createFromRules(
         "html-any", HTML_RULES, Transliterator.REVERSE);

@@ -151,6 +156,14 @@ the double-quote character (") as "&quot;".
         return result.getBuffer().toString();
     }

+    public void showSetDifferences(
+            PrintWriter pw,
+            String name1,
+            UnicodeSet set1,
+            String name2,
+            UnicodeSet set2) {
+        showSetDifferences(pw, name1, set1, name2, set2, -1);
+    }
     /**
      * Compare two UnicodeSets, and show the differences
      * @param name1 name of first set to be compared
@@ -164,24 +177,37 @@ the double-quote character (") as "&quot;".
             String name1,
             UnicodeSet set1,
             String name2,
-            UnicodeSet set2) {
+            UnicodeSet set2,
+            int flags)
+    {
         if (pw == null) pw = CONSOLE;
         String[] names = { name1, name2 };

-        UnicodeSet temp = new UnicodeSet(set1).removeAll(set2);
-        pw.println();
-        pw.println(inOut.format(names));
-        showSetNames(pw, temp);
+        UnicodeSet temp;
+
+        if ((flags&1) != 0) {
+            temp = new UnicodeSet(set1).removeAll(set2);
+            pw.print(lineSeparator);
+            pw.print(inOut.format(names));
+            pw.print(lineSeparator);
+            showSetNames(pw, temp);
+        }

-        temp = new UnicodeSet(set2).removeAll(set1);
-        pw.println();
-        pw.println(outIn.format(names));
-        showSetNames(pw, temp);
+        if ((flags&2) != 0) {
+            temp = new UnicodeSet(set2).removeAll(set1);
+            pw.print(lineSeparator);
+            pw.print(outIn.format(names));
+            pw.print(lineSeparator);
+            showSetNames(pw, temp);
+        }

-        temp = new UnicodeSet(set2).retainAll(set1);
-        pw.println();
-        pw.println(inIn.format(names));
-        showSetNames(pw, temp);
+        if ((flags&4) != 0) {
+            temp = new UnicodeSet(set2).retainAll(set1);
+            pw.print(lineSeparator);
+            pw.print(inIn.format(names));
+            pw.print(lineSeparator);
+            showSetNames(pw, temp);
+        }
         pw.flush();
     }

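For reference, a minimal usage sketch (not part of this commit) of the new flags parameter, based only on the hunk above: bit 1 prints elements that are only in the first set, bit 2 those only in the second, bit 4 the intersection; the new five-argument overload passes -1, i.e. all three sections. Class, writer, and set names below are illustrative.

    import java.io.PrintWriter;
    import com.ibm.icu.dev.test.util.BagFormatter;
    import com.ibm.icu.text.UnicodeSet;

    public class ShowSetDifferencesSketch {
        public static void main(String[] args) {
            BagFormatter bf = new BagFormatter();
            PrintWriter out = new PrintWriter(System.out, true);
            UnicodeSet latin = new UnicodeSet("[:Latin:]");
            UnicodeSet ascii = new UnicodeSet("[\\u0000-\\u007F]");
            // flags = 1|2 = 3: show only the two one-sided differences, skip the intersection.
            bf.showSetDifferences(out, "Latin", latin, "ASCII", ascii, 3);
            // The five-argument overload added above is equivalent to passing -1 (all bits set).
            bf.showSetDifferences(out, "Latin", latin, "ASCII", ascii);
            out.flush();
        }
    }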
@@ -397,12 +423,14 @@ the double-quote character (") as "&quot;".

     // refactored
     public String getName(int codePoint, boolean withCodePoint) {
-        return getNameSource().getValue(codePoint, !withCodePoint);
+        String result = getNameSource().getValue(codePoint, !withCodePoint);
+        return fixName == null ? result : fixName.transliterate(result);
     }

     public String getName(String s, boolean withCodePoint) {
-        return getNameSource().getValue(s, separator, !withCodePoint);
-    }
+        String result = getNameSource().getValue(s, separator, !withCodePoint);
+        return fixName == null ? result : fixName.transliterate(result);
+    }

     public String hex(String s) {
         return hex(s,separator);
@ -445,6 +473,7 @@ the double-quote character (") as """.
|
|||
|
||||
private boolean mergeRanges = true;
|
||||
private Transliterator showLiteral = null;
|
||||
private Transliterator fixName = null;
|
||||
private boolean showSetAlso = false;
|
||||
|
||||
private RangeFinder rf = new RangeFinder();
|
||||
|
@ -580,10 +609,16 @@ the double-quote character (") as """.
|
|||
doAt((Visitor.CodePointRange) o);
|
||||
} else {
|
||||
String thing = o.toString();
|
||||
String value = getValueSource() == UnicodeLabel.NULL ? "" : getValueSource().getValue(thing, ",", true);
|
||||
if (value.length() != 0) value = "\t; " + value;
|
||||
String label = getLabelSource(true).getValue(thing, ",", true);
|
||||
if (label.length() != 0) label = " " + label;
|
||||
output.print(
|
||||
myTabber.process(
|
||||
hex(thing)
|
||||
+ value
|
||||
+ commentSeparator
|
||||
+ label
|
||||
+ insertLiteral(thing)
|
||||
+ "\t"
|
||||
+ getName(thing))
|
||||
|
@@ -1095,4 +1130,16 @@ the double-quote character (") as "&quot;".
         return this;
     }

+    /**
+     * @return Returns the fixName.
+     */
+    public Transliterator getFixName() {
+        return fixName;
+    }
+    /**
+     * @param fixName The fixName to set.
+     */
+    public void setFixName(Transliterator fixName) {
+        this.fixName = fixName;
+    }
 }
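A sketch (not in the commit) of how the new fixName hook is meant to be wired; it mirrors the writeCollationValidityLog() change later in this diff, where both the shown literal and the derived character name are routed through the toHTML transliterator.

    import com.ibm.icu.dev.test.util.BagFormatter;

    public class FixNameSketch {
        public static void main(String[] args) {
            BagFormatter bf = new BagFormatter();
            // With fixName set, getName(...) output is passed through the transliterator,
            // so names containing <, >, & stay safe inside HTML logs.
            bf.setShowLiteral(BagFormatter.toHTML);
            bf.setFixName(BagFormatter.toHTML);
        }
    }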
@@ -121,7 +121,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
     public List getValueAliases(String valueAlias, List result) {
         if (result == null) result = new ArrayList(1);
         result = _getValueAliases(valueAlias, result);
-        if (!result.contains(valueAlias) && type < NUMERIC) {
+        if (!result.contains(valueAlias) ) { // FIX && type < NUMERIC
             result = _getValueAliases(valueAlias, result); // for debugging
             throw new IllegalArgumentException(
                 "Internal error: " + getName() + " doesn't contain " + valueAlias
@ -609,6 +609,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
|
|||
}
|
||||
|
||||
private class PropertySymbolTable implements SymbolTable {
|
||||
static final boolean DEBUG = false;
|
||||
private String prefix;
|
||||
RegexMatcher regexMatcher = new RegexMatcher();
|
||||
|
||||
|
@ -698,7 +699,7 @@ public abstract class UnicodeProperty extends UnicodeLabel {
|
|||
int i;
|
||||
for (i = start; i < limit; i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(text, i);
|
||||
if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp)) {
|
||||
if (!com.ibm.icu.lang.UCharacter.isUnicodeIdentifierPart(cp) && cp != '.') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -876,7 +877,9 @@ public abstract class UnicodeProperty extends UnicodeLabel {
|
|||
|
||||
public static abstract class BaseProperty extends UnicodeProperty {
|
||||
protected List propertyAliases = new ArrayList(1);
|
||||
String version;
|
||||
protected Map toValueAliases;
|
||||
protected String version;
|
||||
|
||||
public BaseProperty setMain(String alias, String shortAlias, int propertyType,
|
||||
String version) {
|
||||
setName(alias);
|
||||
|
@@ -893,12 +896,56 @@ public abstract class UnicodeProperty extends UnicodeLabel {
         addAllUnique(propertyAliases, result);
         return result;
     }
+    public BaseProperty addValueAliases(String[][] valueAndAlternates) {
+        if (toValueAliases == null) _fixValueAliases();
+        for (int i = 0; i < valueAndAlternates.length; ++i) {
+            for (int j = 1; j < valueAndAlternates[0].length; ++j) {
+                addValueAlias(valueAndAlternates[i][0], valueAndAlternates[i][j]);
+            }
+        }
+        return this;
+    }
+    public void addValueAlias(String value, String valueAlias) {
+        List result = (List) toValueAliases.get(value);
+        addUnique(value, result);
+        addUnique(valueAlias, result);
+    }
+    protected List _getValueAliases(String valueAlias, List result) {
+        if (toValueAliases == null) _fixValueAliases();
+        List a = (List) toValueAliases.get(valueAlias);
+        if (a != null) addAllUnique(a, result);
+        return result;
+    }
+
+    protected void _fixValueAliases() {
+        if (toValueAliases == null) toValueAliases = new HashMap(1);
+        for (Iterator it = getAvailableValues().iterator(); it.hasNext();) {
+            Object value = it.next();
+            List result;
+            _ensureValueInAliases(value);
+        }
+    }
+    protected void _ensureValueInAliases(Object value) {
+        List result = (List) toValueAliases.get(value);
+        if (result == null) toValueAliases.put(value, result = new ArrayList(1));
+        addUnique(value, result);
+    }
+    public BaseProperty swapFirst2ValueAliases() {
+        for (Iterator it = toValueAliases.keySet().iterator(); it.hasNext();) {
+            List list = (List) toValueAliases.get(it.next());
+            if (list.size() < 2) continue;
+            Object first = list.get(0);
+            list.set(0, list.get(1));
+            list.set(1, first);
+        }
+        return this;
+    }
+

     }

     public static abstract class SimpleProperty extends BaseProperty {
         List values;
         Map toValueAliases = new HashMap(1);

         public SimpleProperty addName(String alias) {
             propertyAliases.add(alias);
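An illustrative sketch (not part of the commit) of how the new BaseProperty alias helpers compose, using only the signatures visible in the hunk above; the value/alias strings are invented data, and "prop" stands for any concrete BaseProperty subclass instance.

    import com.ibm.icu.dev.test.util.UnicodeProperty;

    public class ValueAliasSketch {
        static void register(UnicodeProperty.BaseProperty prop) {
            // First entry of each row is the value, the rest are its aliases.
            prop.addValueAliases(new String[][] {
                {"Yes", "Y", "T", "True"},
                {"No",  "N", "F", "False"},
            });
            // Swap the first two entries of every alias list, so that the short
            // form is reported first by getValueAliases(...).
            prop.swapFirst2ValueAliases();
        }
    }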
@ -918,62 +965,52 @@ public abstract class UnicodeProperty extends UnicodeLabel {
|
|||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public SimpleProperty setValues(List valueAliases) {
|
||||
this.values = new ArrayList(valueAliases);
|
||||
for (Iterator it = this.values.iterator(); it.hasNext(); ) {
|
||||
_addToValues(it.next(), null);
|
||||
_addToValues((String)it.next(), null);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public List _getValueAliases(String valueAlias, List result) {
|
||||
if (toValueAliases == null) _fillValues();
|
||||
List a = (List) toValueAliases.get(valueAlias);
|
||||
if (a != null) addAllUnique(a, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
public List _getAvailableValues(List result) {
|
||||
if (values == null) _fillValues();
|
||||
result.addAll(values);
|
||||
return result;
|
||||
}
|
||||
|
||||
private void _fillValues() {
|
||||
|
||||
protected void _fillValues() {
|
||||
List newvalues = (List) getUnicodeMap().getAvailableValues(new ArrayList());
|
||||
for (Iterator it = newvalues.iterator(); it.hasNext();) {
|
||||
_addToValues(it.next(), null);
|
||||
_addToValues((String)it.next(), null);
|
||||
}
|
||||
}
|
||||
|
||||
private void _addToValues(Object item, Object alias) {
|
||||
|
||||
private void _addToValues(String item, String alias) {
|
||||
if (values == null) values = new ArrayList(1);
|
||||
if (toValueAliases == null) _fixValueAliases();
|
||||
addUnique(item, values);
|
||||
List aliases = (List) toValueAliases.get(item);
|
||||
if (aliases == null) {
|
||||
aliases = new ArrayList(1);
|
||||
toValueAliases.put(item, aliases);
|
||||
}
|
||||
addUnique(alias, aliases);
|
||||
addUnique(item, aliases);
|
||||
_ensureValueInAliases(item);
|
||||
addValueAlias(item, alias);
|
||||
}
|
||||
public String _getVersion() {
|
||||
/* public String _getVersion() {
|
||||
return version;
|
||||
}
|
||||
}
|
||||
*/ }
|
||||
|
||||
public static class UnicodeMapProperty extends BaseProperty {
|
||||
protected UnicodeMap unicodeMap;
|
||||
protected String _getValue(int codepoint) {
|
||||
return (String) unicodeMap.getValue(codepoint);
|
||||
}
|
||||
protected List _getValueAliases(String valueAlias, List result) {
|
||||
/* protected List _getValueAliases(String valueAlias, List result) {
|
||||
if (!unicodeMap.getAvailableValues().contains(valueAlias)) return result;
|
||||
result.add(valueAlias);
|
||||
return result; // no other aliases
|
||||
}
|
||||
protected List _getAvailableValues(List result) {
|
||||
*/ protected List _getAvailableValues(List result) {
|
||||
return (List) unicodeMap.getAvailableValues(result);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/GenOverlap.java,v $
|
||||
* $Date: 2004/02/07 01:01:12 $
|
||||
* $Revision: 1.12 $
|
||||
* $Date: 2005/04/06 08:48:16 $
|
||||
* $Revision: 1.13 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@@ -164,8 +164,8 @@ public class GenOverlap implements UCD_Types, UCA_Types {
     static boolean PROGRESS = false;

     static void fullCheck() throws IOException {
-        PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "Overlap.html", Utility.UTF8_WINDOWS);
-        PrintWriter simpleList = Utility.openPrintWriter(UCA_GEN_DIR, "Overlap.txt", Utility.UTF8_WINDOWS);
+        PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.html", Utility.UTF8_WINDOWS);
+        PrintWriter simpleList = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Overlap.txt", Utility.UTF8_WINDOWS);

         Iterator it = completes.keySet().iterator();
         int counter = 0;
@ -448,7 +448,7 @@ public class GenOverlap implements UCD_Types, UCA_Types {
|
|||
newKeys.removeAll(joint);
|
||||
oldKeys.removeAll(joint);
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS);
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCA-old-vs-new" + (doMax ? "-MAX.txt" : ".txt"), Utility.UTF8_WINDOWS);
|
||||
Iterator it = list.iterator();
|
||||
int last = -1;
|
||||
while (it.hasNext()) {
|
||||
|
@ -631,7 +631,7 @@ public class GenOverlap implements UCD_Types, UCA_Types {
|
|||
|
||||
System.out.println("Data Gathered");
|
||||
|
||||
PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "checkstringsearchhash.html", Utility.UTF8_WINDOWS);
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "checkstringsearchhash.html", Utility.UTF8_WINDOWS);
|
||||
Utility.writeHtmlHeader(log, "Check Hash");
|
||||
log.println("<h1>Collisions</h1>");
|
||||
log.println("<p>Shows collisions among primary values when hashed to table size = " + tableLength + ".");
|
||||
|
@ -694,7 +694,7 @@ public class GenOverlap implements UCD_Types, UCA_Types {
|
|||
}
|
||||
|
||||
public static void listCyrillic(UCA collatorIn) throws IOException {
|
||||
PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, "ListCyrillic.txt", Utility.UTF8_WINDOWS);
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "ListCyrillic.txt", Utility.UTF8_WINDOWS);
|
||||
Set set = new TreeSet(collatorIn);
|
||||
Set set2 = new TreeSet(collatorIn);
|
||||
ucd = UCD.make();
|
||||
|
|
|
@@ -168,7 +168,7 @@ public class Implicit implements UCD_Types {
      */
     public Implicit(int minPrimary, int maxPrimary) {
         // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms.
-        this(minPrimary, maxPrimary, 0x03, 0xFE, 1, 1);
+        this(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1);
     }

     /**
@@ -181,6 +181,14 @@ public class Implicit implements UCD_Types {
      * @param primaries3count number of 3-byte primarys we can use (normally 1)
      */
     public Implicit(int minPrimary, int maxPrimary, int minTrail, int maxTrail, int gap3, int primaries3count) {
+        if (DEBUG) {
+            System.out.println("minPrimary: " + Utility.hex(minPrimary));
+            System.out.println("maxPrimary: " + Utility.hex(maxPrimary));
+            System.out.println("minTrail: " + Utility.hex(minTrail));
+            System.out.println("maxTrail: " + Utility.hex(maxTrail));
+            System.out.println("gap3: " + Utility.hex(gap3));
+            System.out.println("primaries3count: " + primaries3count);
+        }
         // some simple parameter checks
         if (minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) throw new IllegalArgumentException("bad lead bytes");
         if (minTrail < 0 || minTrail >= maxTrail || maxTrail > 0xFF) throw new IllegalArgumentException("bad trail bytes");
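A small sketch (not from the commit) of constructing Implicit with parameters the new checks validate; the package name is assumed from the surrounding UCA tool files, and the byte values are illustrative choices that satisfy minPrimary < maxPrimary <= 0xFF and minTrail < maxTrail <= 0xFF.

    import com.ibm.text.UCA.Implicit;

    public class ImplicitSketch {
        public static void main(String[] args) {
            // Two-argument form now delegates with minTrail = 0x04 (was 0x03).
            Implicit implicit = new Implicit(0xE0, 0xE4);
            // Full form: lead-byte range, trail-byte range, 3-byte gap, 3-byte primary count.
            Implicit custom = new Implicit(0xE0, 0xE4, 0x04, 0xFE, 1, 1);
            // Out-of-range arguments now fail fast, e.g. maxPrimary > 0xFF:
            // new Implicit(0xE0, 0x100);  // throws IllegalArgumentException("bad lead bytes")
        }
    }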
@ -5,19 +5,24 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/Main.java,v $
|
||||
* $Date: 2004/01/15 01:08:30 $
|
||||
* $Revision: 1.18 $
|
||||
* $Date: 2005/04/06 08:48:16 $
|
||||
* $Revision: 1.19 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.text.UCA;
|
||||
import java.io.File;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.text.UCD.*;
|
||||
import com.ibm.text.utility.*;
|
||||
|
||||
|
||||
public class Main {
|
||||
static final String UCDVersion = "4.0.0";
|
||||
//static final String UCDVersion = "4.0.0";
|
||||
static final String[] ICU_FILES = {"writeCollationValidityLog", "writeFractionalUCA",
|
||||
"WriteRules", "WriteRulesXML", "writeconformance", "writeconformanceshifted",
|
||||
"short",
|
||||
|
@ -28,18 +33,10 @@ public class Main {
|
|||
};
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
||||
//checkCanonicalIterator();
|
||||
// NOTE: so far, we don't need to build the UCA with anything but the latest versions.
|
||||
// A few changes would need to be made to the code to do older versions.
|
||||
try {
|
||||
System.out.println("Building UCA");
|
||||
Default.setUCD(UCDVersion);
|
||||
WriteCollationData.collator = new UCA(null, UCDVersion);
|
||||
System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
|
||||
+ "/ucd: " + WriteCollationData.collator.getUCDVersion());
|
||||
|
||||
System.out.println("Building UCD data");
|
||||
WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion());
|
||||
|
||||
if (args.length == 0) args = new String[] {"?"}; // force the help comment
|
||||
boolean shortPrint = false;
|
||||
|
@ -54,7 +51,22 @@ public class Main {
|
|||
args = Utility.append(ICU_FILES, Utility.subarray(args, i+1));
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (arg.equalsIgnoreCase("version")) {
|
||||
Default.setUCD(args[++i]); // get next arg
|
||||
continue;
|
||||
}
|
||||
if (WriteCollationData.collator == null) {
|
||||
System.out.println("Building UCA");
|
||||
String file = Utility.searchDirectory(new File(UCD_Types.BASE_DIR + "UCA\\" + Default.ucdVersion() + "\\"), "allkeys", true, ".txt");
|
||||
WriteCollationData.collator = new UCA(file, Default.ucdVersion());
|
||||
System.out.println("Built version " + WriteCollationData.collator.getDataVersion()
|
||||
+ "/ucd: " + WriteCollationData.collator.getUCDVersion());
|
||||
|
||||
System.out.println("Building UCD data");
|
||||
WriteCollationData.ucd = UCD.make(WriteCollationData.collator.getUCDVersion());
|
||||
|
||||
}
|
||||
if (arg.equalsIgnoreCase("GenOverlap")) GenOverlap.test(WriteCollationData.collator);
|
||||
else if (arg.equalsIgnoreCase("validateUCA")) GenOverlap.validateUCA(WriteCollationData.collator);
|
||||
//else if (arg.equalsIgnoreCase("writeNonspacingDifference")) WriteCollationData.writeNonspacingDifference();
|
||||
|
@ -125,4 +137,37 @@ public class Main {
|
|||
*/
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static void checkCanonicalIterator() {
|
||||
|
||||
int firstImplicit = WriteCollationData.getImplicitPrimary(UCD_Types.CJK_BASE);
|
||||
System.out.println("UCD_Types.CJK_BASE: " + Utility.hex(UCD_Types.CJK_BASE));
|
||||
System.out.println("first implicit: " + Utility.hex((long)(firstImplicit & 0xFFFFFFFFL)));
|
||||
|
||||
CanonicalIterator it = new CanonicalIterator("");
|
||||
String[] tests = new String[] {"\uF900"};
|
||||
for (int j = 0; j < tests.length; ++j) {
|
||||
System.out.println(tests[j]);
|
||||
it.setSource(tests[j]);
|
||||
String ss;
|
||||
for (int i = 0; (ss = it.next()) != null; ++i) {
|
||||
System.out.println(i + "\t" + Utility.hex(ss));
|
||||
}
|
||||
}
|
||||
if (true) throw new IllegalArgumentException();
|
||||
for (int i = 0; i < 0x10FFFF; ++i) {
|
||||
int cat = UCharacter.getType(i);
|
||||
if (cat == UCharacter.UNASSIGNED || cat == UCharacter.PRIVATE_USE || cat == UCharacter.SURROGATE) continue;
|
||||
String s = UTF16.valueOf(i);
|
||||
try {
|
||||
it.setSource(s);
|
||||
} catch (RuntimeException e) {
|
||||
System.out.println("Failure with U+" + Utility.hex(i));
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA.java,v $
|
||||
* $Date: 2004/02/06 18:32:03 $
|
||||
* $Revision: 1.23 $
|
||||
* $Date: 2005/04/06 08:48:16 $
|
||||
* $Revision: 1.24 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -14,6 +14,8 @@
|
|||
package com.ibm.text.UCA;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.Reader;
|
||||
import java.io.PrintWriter;
|
||||
|
@@ -108,13 +110,16 @@ final public class UCA implements Comparator, UCA_Types {
     // Main Methods
     // =============================================================

+    private String fileVersion = "??";
+
     /**
      * Initializes the collation from a stream of rules in the normal formal.
      * If the source is null, uses the normal Unicode data files, which
      * need to be in BASE_DIR.
      */
-    public UCA(BufferedReader source, String unicodeVersion) throws java.io.IOException {
-        fullData = source == null;
+    public UCA(String sourceFile, String unicodeVersion) throws java.io.IOException {
+        fullData = sourceFile == null;
+        fileVersion = sourceFile;

         // load the normalizer
         if (toD == null) {
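Sketch (not part of the commit) of the new file-based constructor; it follows the call added in Main.java later in this diff, where the allkeys file is located first and then handed to UCA together with the UCD version. The path below is a placeholder, and the Unicode tools data environment (BASE_DIR and the UCD files) is assumed to be set up as in Main.java.

    import com.ibm.text.UCA.UCA;

    public class BuildUCASketch {
        public static void main(String[] args) throws Exception {
            // The constructor now takes a file name rather than a BufferedReader and
            // remembers it, so getFileVersion() can report which table was loaded.
            String allkeys = "allkeys-4.1.0.txt";   // placeholder path
            UCA collator = new UCA(allkeys, "4.1.0");
            System.out.println("data version: " + collator.getDataVersion());
            System.out.println("loaded from:  " + collator.getFileVersion());
        }
    }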
@ -127,15 +132,19 @@ final public class UCA implements Comparator, UCA_Types {
|
|||
ucaData = new UCA_Data(toD, ucd);
|
||||
|
||||
// either get the full sources, or just a demo set
|
||||
if (fullData) {
|
||||
/* if (fullData) {
|
||||
for (int i = 0; i < KEYS.length; ++i) {
|
||||
BufferedReader in = new BufferedReader(
|
||||
new FileReader(KEYS[i]), BUFFER_SIZE);
|
||||
addCollationElements(in);
|
||||
in.close();
|
||||
}
|
||||
} else {
|
||||
addCollationElements(source);
|
||||
} else */
|
||||
{
|
||||
BufferedReader in = new BufferedReader(
|
||||
new FileReader(sourceFile), BUFFER_SIZE);
|
||||
addCollationElements(in);
|
||||
in.close();
|
||||
}
|
||||
cleanup();
|
||||
}
|
||||
|
@ -830,16 +839,17 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
/**
|
||||
* List of files to use for constructing the CE data, used by build()
|
||||
*/
|
||||
private static final String[] KEYS = {
|
||||
|
||||
/* private static final String[] KEYS = {
|
||||
//"D:\\UnicodeData\\testkeys.txt",
|
||||
BASE_DIR + "Collation\\allkeys" + VERSION + ".txt",
|
||||
/*
|
||||
BASE_DIR + "UCA\\allkeys" + VERSION + ".txt",
|
||||
|
||||
BASE_DIR + "UnicodeData\\Collation\\basekeys" + VERSION + ".txt",
|
||||
BASE_DIR + "UnicodeData\\Collation\\compkeys" + VERSION + ".txt",
|
||||
BASE_DIR + "UnicodeData\\Collation\\ctrckeys" + VERSION + ".txt",
|
||||
*/
|
||||
|
||||
};
|
||||
|
||||
*/
|
||||
/**
|
||||
* File buffer size, used to make reads faster.
|
||||
*/
|
||||
|
@ -1089,6 +1099,13 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
|
||||
static boolean haveUnspecified = false;
|
||||
static UnicodeSet unspecified = new UnicodeSet();
|
||||
UnicodeSet variantSecondaries = new UnicodeSet(0x0153,0x0154);
|
||||
UnicodeSet digitSecondaries = new UnicodeSet(0x155,0x017F);
|
||||
UnicodeSet homelessSecondaries;
|
||||
|
||||
// static UnicodeSet homelessSecondaries = new UnicodeSet(0x0176, 0x0198);
|
||||
// 0x0153..0x017F
|
||||
|
||||
|
||||
public class UCAContents {
|
||||
int current = -1;
|
||||
|
@ -1130,9 +1147,10 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
|
||||
/**
|
||||
* use FIXED_CE as the limit
|
||||
* @param newValue TODO
|
||||
*/
|
||||
public void enableSamples() {
|
||||
doSamples = true;
|
||||
public void setDoEnableSamples(boolean newValue) {
|
||||
doSamples = newValue;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1179,7 +1197,7 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
if (!haveUnspecified) {
|
||||
if (DEBUG) System.out.println("Specified = " + unspecified.toPattern(true));
|
||||
UnicodeSet temp = new UnicodeSet();
|
||||
for (int i = 0; i < 0x10ffff; ++i) {
|
||||
for (int i = 0; i <= 0x10ffff; ++i) {
|
||||
if (!ucd.isAllocated(i)) continue;
|
||||
if (!unspecified.contains(i)) {
|
||||
temp.add(i);
|
||||
|
@ -1265,6 +1283,12 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Returns the doSamples.
|
||||
*/
|
||||
public boolean isDoSamples() {
|
||||
return doSamples;
|
||||
}
|
||||
}
|
||||
|
||||
static final int[][] SAMPLE_RANGES = {
|
||||
|
@@ -1312,6 +1336,14 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
             while (true) try {
                 inputLine = in.readLine();
                 if (inputLine == null) break; // means file is done
+
+                // HACK
+                if (inputLine.startsWith("# Variant secondaries:")) {
+                    variantSecondaries = extractSet(inputLine);
+                } else if (inputLine.startsWith("# Digit secondaries:")) {
+                    digitSecondaries = extractSet(inputLine);
+                }
+
                 String line = cleanLine(inputLine); // remove comments, extra whitespace
                 if (line.length() == 0) continue; // skip empty lines

@@ -1407,7 +1439,18 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
         }
     }

-    /*
+    /**
+     *
+     */
+    private UnicodeSet extractSet(String inputLine) {
+        //# Variant secondaries:    0177..017B (5)
+        //# Digit secondaries:      017C..0198 (29)
+        Matcher m = Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher("");
+        if (!m.reset(inputLine).matches()) throw new IllegalArgumentException("Failed to recognized special Ken lines: " + inputLine);
+        return new UnicodeSet(Integer.parseInt(m.group(1),16), Integer.parseInt(m.group(2),16));
+    }
+
+    /*
     private void concat(int[] ces1, int[] ces2) {

     }
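The regex in extractSet pulls the two hex endpoints out of the "# Variant secondaries:" and "# Digit secondaries:" comment lines; a standalone sketch (not in the commit) of that same pattern, using only the JDK:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class ExtractSetSketch {
        public static void main(String[] args) {
            String inputLine = "# Variant secondaries:    0177..017B (5)";
            Matcher m = Pattern.compile(".*:\\s*([0-9A-Fa-f]+)\\.\\.([0-9A-Fa-f]+).*").matcher("");
            if (m.reset(inputLine).matches()) {
                int start = Integer.parseInt(m.group(1), 16);   // 0x0177
                int end   = Integer.parseInt(m.group(2), 16);   // 0x017B
                System.out.printf("range: %04X..%04X%n", start, end);
            }
        }
    }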
@@ -1737,4 +1780,25 @@ CP => [.AAAA.0020.0002.][.BBBB.0000.0000.]
             uniqueTable.put(ceObj, new Character(value));
         }
     }
+    /**
+     * @return Returns the fileVersion.
+     */
+    public String getFileVersion() {
+        return fileVersion;
+    }
+    /**
+     * @return Returns the uCA_GEN_DIR.
+     */
+    public String getUCA_GEN_DIR() {
+        return BASE_UCA_GEN_DIR + getDataVersion() + "\\";
+    }
+
+
+    /**
+     * @return Returns the homelessSecondaries.
+     */
+    public UnicodeSet getHomelessSecondaries() {
+        if (homelessSecondaries == null) homelessSecondaries = new UnicodeSet(variantSecondaries).addAll(digitSecondaries);
+        return homelessSecondaries;
+    }
 }
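Sketch (not part of the commit) of the two new accessors shown above; output locations now depend on the data version of the loaded table rather than on the compile-time UCA_GEN_DIR constant, which is why the tool classes later in this diff switch to collator.getUCA_GEN_DIR(). The helper method and its argument are illustrative.

    import com.ibm.text.UCA.UCA;
    import com.ibm.icu.text.UnicodeSet;

    public class UcaAccessorsSketch {
        static void show(UCA collator) {
            // e.g. <GEN_DIR>\collation\<dataVersion>\ , derived from the table actually loaded.
            System.out.println("gen dir: " + collator.getUCA_GEN_DIR());
            // Union of the variant and digit secondary ranges (defaults, or the
            // ranges parsed from the allkeys header lines).
            UnicodeSet homeless = collator.getHomelessSecondaries();
            System.out.println("homeless secondaries: " + homeless.toPattern(true));
        }
    }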
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/UCA_Types.java,v $
|
||||
* $Date: 2004/01/13 18:32:11 $
|
||||
* $Revision: 1.6 $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.7 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@@ -20,11 +20,11 @@ public interface UCA_Types {
      * Version of the UCA tables to use
      */
     //private static final String VERSION = "-3.0.1d3"; // ""; // "-2.1.9d7";
-    public static final String UCA_BASE = "4.0.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7";
-    public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7";
+    //public static final String UCA_BASE = "4.1.0"; // "3.1.1"; // ; // ""; // "-2.1.9d7";
+    //public static final String VERSION = "-" + UCA_BASE; // + "d6" ""; // "-2.1.9d7";
     public static final String ALLFILES = "allkeys"; // null if not there

-    public static final String UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation_" + UCA_BASE + "\\";
+    public static final String BASE_UCA_GEN_DIR = UCD_Types.GEN_DIR + "collation" + "\\";
     public static final char LEVEL_SEPARATOR = '\u0000';
     /**
      * Expanding characters are marked with a exception bit combination
@ -94,5 +94,5 @@ public interface UCA_Types {
|
|||
CJK_CE = 3, CJK_AB_CE = 4, HANGUL_CE = 5, UNSUPPORTED_CE = 7,
|
||||
FIXED_CE = 3;
|
||||
// SURROGATE_CE = 6,
|
||||
|
||||
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCharts.java,v $
|
||||
* $Date: 2004/02/12 08:23:19 $
|
||||
* $Revision: 1.20 $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.21 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -29,6 +29,7 @@ import java.text.SimpleDateFormat;
|
|||
|
||||
public class WriteCharts implements UCD_Types {
|
||||
|
||||
static String WORKING_DIR = ".\\com\\ibm\\text\\UCA\\";
|
||||
static boolean HACK_KANA = false;
|
||||
|
||||
static public void special() {
|
||||
|
@ -50,7 +51,7 @@ public class WriteCharts implements UCD_Types {
|
|||
//Normalizer nfc = new Normalizer(Normalizer.NFC);
|
||||
|
||||
UCA.UCAContents cc = uca.getContents(UCA.FIXED_CE, null); // nfd instead of null if skipping decomps
|
||||
cc.enableSamples();
|
||||
cc.setDoEnableSamples(true);
|
||||
|
||||
Set set = new TreeSet();
|
||||
|
||||
|
@ -84,12 +85,12 @@ public class WriteCharts implements UCD_Types {
|
|||
String[] replacement = new String[] {"%%%", "Collation Charts"};
|
||||
String folder = "charts\\uca\\";
|
||||
|
||||
Utility.copyTextFile("index.html", Utility.UTF8, folder + "index.html", replacement);
|
||||
Utility.copyTextFile("charts.css", Utility.LATIN1, folder + "charts.css");
|
||||
Utility.copyTextFile("help.html", Utility.UTF8, folder + "help.html");
|
||||
Utility.copyTextFile(WORKING_DIR + "index.html", Utility.UTF8, folder + "index.html", replacement);
|
||||
Utility.copyTextFile(WORKING_DIR + "charts.css", Utility.LATIN1, folder + "charts.css");
|
||||
Utility.copyTextFile(WORKING_DIR + "help.html", Utility.UTF8, folder + "help.html");
|
||||
|
||||
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
|
||||
Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
|
||||
/*
|
||||
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
|
@ -100,6 +101,7 @@ public class WriteCharts implements UCD_Types {
|
|||
indexFile.println("<p align='center'><a href = 'help.html'>Help</a>");
|
||||
*/
|
||||
|
||||
int lastCp = -1;
|
||||
while (it.hasNext()) {
|
||||
Utility.dot(counter);
|
||||
|
||||
|
@ -110,6 +112,7 @@ public class WriteCharts implements UCD_Types {
|
|||
int cp = UTF16.charAt(s,0);
|
||||
|
||||
byte script = Default.ucd().getScript(cp);
|
||||
if (cp == 0x1DBF) script = UCD.GREEK_SCRIPT; // 4.1.0 hack
|
||||
|
||||
// get first non-zero primary
|
||||
int currentPrimary = getFirstPrimary(sortKey);
|
||||
|
@ -128,6 +131,7 @@ public class WriteCharts implements UCD_Types {
|
|||
if (script == KATAKANA_SCRIPT) script = HIRAGANA_SCRIPT;
|
||||
else if ((script == INHERITED_SCRIPT || script == COMMON_SCRIPT) && oldScript >= 0) script = oldScript;
|
||||
|
||||
int veryOldScript = oldScript;
|
||||
if (script != oldScript
|
||||
// && (script != COMMON_SCRIPT && script != INHERITED_SCRIPT)
|
||||
) {
|
||||
|
@ -140,7 +144,9 @@ public class WriteCharts implements UCD_Types {
|
|||
++scriptCount[script+3];
|
||||
if (scriptCount[script+3] > 1) {
|
||||
System.out.println("\t\tFAIL: " + scriptCount[script+3] + ", " +
|
||||
getChunkName(script, LONG) + ", " + Default.ucd().getCodeAndName(s));
|
||||
getChunkName(script, LONG) + ", " + Default.ucd().getCodeAndName(s)
|
||||
+ " - last char: "
|
||||
+ getChunkName(veryOldScript, LONG) + ", " + Default.ucd().getCodeAndName(lastCp));
|
||||
}
|
||||
output = openFile(scriptCount[script+3], folder, script);
|
||||
}
|
||||
|
@ -179,6 +185,7 @@ public class WriteCharts implements UCD_Types {
|
|||
|
||||
output.println(breaker + outline);
|
||||
++columnCount;
|
||||
lastCp = cp;
|
||||
}
|
||||
|
||||
closeFile(output);
|
||||
|
@ -265,7 +272,7 @@ public class WriteCharts implements UCD_Types {
|
|||
Utility.copyTextFile("norm_help.html", Utility.UTF8, folder + "help.html");
|
||||
|
||||
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
|
||||
Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
|
||||
/*
|
||||
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
|
@ -373,7 +380,7 @@ public class WriteCharts implements UCD_Types {
|
|||
Utility.copyTextFile("case_help.html", Utility.UTF8, folder + "help.html");
|
||||
|
||||
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
|
||||
Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
|
||||
/*
|
||||
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
|
@ -485,7 +492,7 @@ public class WriteCharts implements UCD_Types {
|
|||
Utility.copyTextFile("script_help.html", Utility.UTF8, folder + "help.html");
|
||||
|
||||
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
|
||||
Utility.appendFile("script_index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
Utility.appendFile(WORKING_DIR + "script_index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
|
||||
/*
|
||||
indexFile.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
|
@ -607,7 +614,7 @@ public class WriteCharts implements UCD_Types {
|
|||
Utility.copyTextFile("name_help.html", Utility.UTF8, folder + "help.html");
|
||||
|
||||
indexFile = Utility.openPrintWriter(folder + "index_list.html", Utility.UTF8_WINDOWS);
|
||||
Utility.appendFile("index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
Utility.appendFile(WORKING_DIR + "index_header.html", Utility.UTF8, indexFile, replacement);
|
||||
|
||||
int columnCount = 0;
|
||||
char lastInitial = 0;
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCA/WriteCollationData.java,v $
|
||||
* $Date: 2004/02/07 01:01:11 $
|
||||
* $Revision: 1.39 $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.40 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -17,6 +17,9 @@ import java.util.*;
|
|||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.text.CanonicalIterator;
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.dev.test.util.UnicodePropertySource;
|
||||
import com.ibm.icu.impl.UCharacterProperty;
|
||||
|
||||
import java.io.*;
|
||||
|
@ -36,6 +39,8 @@ import com.ibm.text.UCD.Normalizer;
|
|||
|
||||
public class WriteCollationData implements UCD_Types, UCA_Types {
|
||||
|
||||
// may require fixing
|
||||
|
||||
static final boolean DEBUG = false;
|
||||
static final boolean DEBUG_SHOW_ITERATION = false;
|
||||
|
||||
|
@ -145,7 +150,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
|
|||
BufferedReader in = Utility.openUnicodeFile("CaseFolding", UNICODE_VERSION, true, Utility.LATIN1);
|
||||
// new BufferedReader(new FileReader(DIR31 + "CaseFolding-3.d3.alpha.txt"), 64*1024);
|
||||
// log = new PrintWriter(new FileOutputStream("CaseFolding_data.js"));
|
||||
log = Utility.openPrintWriter(UCA_GEN_DIR, "CaseFolding_data.js", Utility.UTF8_WINDOWS);
|
||||
log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "CaseFolding_data.js", Utility.UTF8_WINDOWS);
|
||||
log.println("var CF = new Object();");
|
||||
int count = 0;
|
||||
while (true) {
|
||||
|
@ -190,7 +195,7 @@ public class WriteCollationData implements UCD_Types, UCA_Types {
|
|||
//Normalizer normKD = new Normalizer(Normalizer.NFKD, UNICODE_VERSION);
|
||||
//Normalizer normD = new Normalizer(Normalizer.NFD, UNICODE_VERSION);
|
||||
//log = new PrintWriter(new FileOutputStream("Normalization_data.js"));
|
||||
log = Utility.openPrintWriter(UCA_GEN_DIR, "Normalization_data.js", Utility.LATIN1_WINDOWS);
|
||||
log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "Normalization_data.js", Utility.LATIN1_WINDOWS);
|
||||
|
||||
|
||||
int count = 0;
|
||||
|
@ -319,7 +324,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
}
|
||||
|
||||
String fullFileName = filename + (shortPrint ? "_SHORT" : "") + ".txt";
|
||||
PrintWriter log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
|
||||
PrintWriter log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS);
|
||||
//if (!shortPrint) log.write('\uFEFF');
|
||||
writeVersionAndDate(log, fullFileName);
|
||||
|
||||
|
@ -327,7 +332,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
int counter = 0;
|
||||
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
|
||||
cc.enableSamples();
|
||||
cc.setDoEnableSamples(true);
|
||||
UnicodeSet found2 = new UnicodeSet();
|
||||
|
||||
while (true) {
|
||||
|
@ -711,7 +716,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
|
||||
static void testCompatibilityCharacters() throws IOException {
|
||||
String fullFileName = "UCA_CompatComparison.txt";
|
||||
log = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
|
||||
log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS);
|
||||
|
||||
int[] kenCes = new int[50];
|
||||
int[] markCes = new int[50];
|
||||
|
@ -1191,7 +1196,13 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
while (it.hasNext()) {
|
||||
if (result.length() != 0) result.append(";<br>");
|
||||
Object item = it.next();
|
||||
if (m != null) item = m.get(item);
|
||||
if (m != null) {
|
||||
Object item2 = m.get(item);
|
||||
if (item2 != null) item = item2;
|
||||
else {
|
||||
System.out.println("Missing Item: " + item);
|
||||
}
|
||||
}
|
||||
if (useName) item = ucd.getCodeAndName(item.toString());
|
||||
result.append(item);
|
||||
}
|
||||
|
@ -1207,7 +1218,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
32*1024));
|
||||
*/
|
||||
String fullFileName = "UCA_Contractions.txt";
|
||||
PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, fullFileName, Utility.UTF8_WINDOWS);
|
||||
PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), fullFileName, Utility.UTF8_WINDOWS);
|
||||
|
||||
diLog.write('\uFEFF');
|
||||
|
||||
|
@ -1246,7 +1257,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
"UTF8"),
|
||||
32*1024));
|
||||
*/
|
||||
PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "DisjointIgnorables.js", Utility.UTF8_WINDOWS);
|
||||
PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "DisjointIgnorables.js", Utility.UTF8_WINDOWS);
|
||||
|
||||
diLog.write('\uFEFF');
|
||||
|
||||
|
@ -1425,7 +1436,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
"UTF8"),
|
||||
32*1024));
|
||||
*/
|
||||
PrintWriter diLog = Utility.openPrintWriter(UCA_GEN_DIR, "DisjointIgnorables2.js", Utility.UTF8_WINDOWS);
|
||||
PrintWriter diLog = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "DisjointIgnorables2.js", Utility.UTF8_WINDOWS);
|
||||
|
||||
diLog.write('\uFEFF');
|
||||
|
||||
|
@ -1637,7 +1648,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
if (UCA.isImplicitLeadCE(ces[0])) {
|
||||
expansionStart = 2; // move up if first is double-ce
|
||||
}
|
||||
if (len > expansionStart && homelessSecondaries.contains(UCA.getSecondary(ces[expansionStart]))) {
|
||||
if (len > expansionStart && collator.getHomelessSecondaries().contains(UCA.getSecondary(ces[expansionStart]))) {
|
||||
if (log2 != null) log2.println("Homeless: " + CEList.toString(ces, len));
|
||||
++expansionStart; // move up if *second* is homeless ignoreable
|
||||
}
|
||||
|
@ -1674,7 +1685,7 @@ U+01D5 LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
|
|||
int[] lenArray = new int[1];
|
||||
|
||||
Set alreadyDone = new HashSet();
|
||||
log2 = Utility.openPrintWriter(UCA_GEN_DIR, "UCARules-log.txt", Utility.UTF8_WINDOWS);
|
||||
log2 = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "UCARules-log.txt", Utility.UTF8_WINDOWS);
|
||||
|
||||
while (true) {
|
||||
String s = cc.next(ces, lenArray);
|
||||
|
@ -1799,7 +1810,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
if (noCE) filename += "_NoCE";
|
||||
if (option == IN_XML) filename += ".xml"; else filename += ".txt";
|
||||
|
||||
log = Utility.openPrintWriter(UCA_GEN_DIR, filename, Utility.UTF8_WINDOWS);
|
||||
log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), filename, Utility.UTF8_WINDOWS);
|
||||
|
||||
String[] commentText = {
|
||||
"UCA Rules",
|
||||
|
@ -2316,8 +2327,6 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
}
|
||||
|
||||
|
||||
static UnicodeSet homelessSecondaries = new UnicodeSet(0x0153,0x017F);
|
||||
|
||||
/*static int[] ignorableList = new int[homelessSecondaries.size()];
|
||||
|
||||
static {
|
||||
|
@ -2396,7 +2405,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
}
|
||||
if (s == null) {
|
||||
do {
|
||||
if (homelessSecondaries.contains(UCA.getSecondary(ces[i]))) {
|
||||
if (collator.getHomelessSecondaries().contains(UCA.getSecondary(ces[i]))) {
|
||||
s = "";
|
||||
if (rel[0] > 1) rel[0] = 1; // HACK
|
||||
break;
|
||||
|
@ -2846,11 +2855,11 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
|
||||
Utility.fixDot();
|
||||
System.out.println("Writing");
|
||||
PrintWriter shortLog = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + "_SHORT.txt"), 32*1024));
|
||||
PrintWriter longLog = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + ".txt"), 32*1024));
|
||||
PrintWriter shortLog = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + "_SHORT.txt"), 32*1024));
|
||||
PrintWriter longLog = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + ".txt"), 32*1024));
|
||||
log = new PrintWriter(new DualWriter(shortLog, longLog));
|
||||
|
||||
PrintWriter summary = new PrintWriter(new BufferedWriter(new FileWriter(UCA_GEN_DIR + filename + "_summary.txt"), 32*1024));
|
||||
PrintWriter summary = new PrintWriter(new BufferedWriter(new FileWriter(collator.getUCA_GEN_DIR() + filename + "_summary.txt"), 32*1024));
|
||||
//log.println("[Variable Low = " + UCA.toString(collator.getVariableLow()) + "]");
|
||||
//log.println("[Variable High = " + UCA.toString(collator.getVariableHigh()) + "]");
|
||||
|
||||
|
@ -3976,7 +3985,7 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
static void writeCollationValidityLog() throws IOException {
|
||||
|
||||
//log = new PrintWriter(new FileOutputStream("CheckCollationValidity.html"));
|
||||
log = Utility.openPrintWriter(UCA_GEN_DIR, "CheckCollationValidity.html", Utility.UTF8_WINDOWS);
|
||||
log = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), "CheckCollationValidity.html", Utility.UTF8_WINDOWS);
|
||||
|
||||
log.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'>");
|
||||
log.println("<title>UCA Validity Log</title>");
|
||||
|
@ -4002,15 +4011,18 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
*/
|
||||
|
||||
UCA.UCAContents cc = collator.getContents(UCA.FIXED_CE, null);
|
||||
cc.enableSamples();
|
||||
//cc.setDoEnableSamples(true);
|
||||
UnicodeSet coverage = new UnicodeSet();
|
||||
|
||||
while (true) {
|
||||
String s = cc.next();
|
||||
if (s == null) break;
|
||||
addString(s, option);
|
||||
coverage.add(s);
|
||||
}
|
||||
|
||||
|
||||
System.out.println("Total: " + sortedD.size());
|
||||
|
||||
Iterator it;
|
||||
|
||||
//ucd.init();
|
||||
|
@ -4051,7 +4063,10 @@ F900..FAFF; CJK Compatibility Ideographs
|
|||
|
||||
log.println("<h1>Collation Validity Checks</h1>");
|
||||
log.println("<table><tr><td>Generated: </td><td>" + getNormalDate() + "</td></tr>");
|
||||
log.println("<tr><td>File Version: </td><td>" + collator.getDataVersion() + "/" + collator.getUCDVersion() + "</td></tr></table>");
|
||||
log.println("<tr><td>Unicode Version: </td><td>" + collator.getUCDVersion());
|
||||
log.println("<tr><td>UCA Data Version (@version in file): </td><td>" + collator.getDataVersion());
|
||||
log.println("<tr><td>UCA File Name: </td><td>" + collator.getFileVersion());
|
||||
log.println("</td></tr></table>");
|
||||
|
||||
if (collator.getDataVersion() == UCA.BADVERSION) {
|
||||
log.println(SERIOUS_ERROR);
|
||||
|
@@ -4076,6 +4091,24 @@ F900..FAFF; CJK Compatibility Ideographs
         addClosure();
         writeDuplicates();
         writeOverlap();
+
+        log.println("<h2>Coverage</h2>");
+        BagFormatter bf = new BagFormatter();
+        bf.setLineSeparator("<br>\r\n");
+        ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
+        bf.setUnicodePropertyFactory(ups);
+        bf.setShowLiteral(bf.toHTML);
+        bf.setFixName(bf.toHTML);
+        UCD ucd = Default.ucd();
+        UnicodeProperty cat = ups.getProperty("gc");
+        UnicodeSet ucd410 = cat.getSet("Cn")
+            .addAll(cat.getSet("Co"))
+            .addAll(cat.getSet("Cs"))
+            .complement()
+            //.addAll(ups.getSet("Noncharactercodepoint=true"))
+            //.addAll(ups.getSet("Default_Ignorable_Code_Point=true"))
+            ;
+        bf.showSetDifferences(log, "UCD4.1.0", ucd410, "UCA4.1.0", coverage, 3);

         log.println("</body></html>");
         log.close();
@ -4670,7 +4703,7 @@ A4C6;YI RADICAL KE;So;0;ON;;;;;N;;;;;
|
|||
|
||||
static PrintWriter writeHead(int counter, int end, String title, String other, String version, boolean show) throws IOException {
|
||||
|
||||
PrintWriter out = Utility.openPrintWriter(UCA_GEN_DIR, title + pad(counter) + ".html", Utility.UTF8_WINDOWS);
|
||||
PrintWriter out = Utility.openPrintWriter(collator.getUCA_GEN_DIR(), title + pad(counter) + ".html", Utility.UTF8_WINDOWS);
|
||||
|
||||
copyFile(out, "HTML-Part1.txt");
|
||||
/*
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2005/03/30 17:19:32 $
|
||||
* $Revision: 1.20 $
|
||||
* $Date: 2005/04/06 08:48:17 $
|
||||
* $Revision: 1.21 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -21,6 +21,7 @@ import java.text.SimpleDateFormat;
|
|||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.dev.test.util.ICUPropertyFactory;
|
||||
import com.ibm.icu.dev.test.util.UnicodeLabel;
|
||||
import com.ibm.icu.dev.test.util.UnicodeMap;
|
||||
import com.ibm.icu.dev.test.util.UnicodeProperty;
|
||||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
|
@ -153,17 +154,23 @@ public class TestData implements UCD_Types {
|
|||
static class GenStringPrep {
|
||||
UnicodeSet[] coreChars = new UnicodeSet[100];
|
||||
UnicodeSet decomposable = new UnicodeSet();
|
||||
UnicodeMap suspect = new UnicodeMap();
|
||||
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("");
|
||||
//UnicodeSet id_continue = ups.getSet("ID_Continue=true");
|
||||
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher())
|
||||
.retainAll(ups.getSet("gc=Sk"))
|
||||
.addAll(new UnicodeSet("[\u0027 \u002D \u002E \u003A \u00B7 \u058A \u05F3" +
|
||||
" \u05F4 \u200C \u200D \u2010 \u2019 \u2027 \u30A0]"));
|
||||
UnicodeSet xid_continue = ups.getSet("XID_Continue=true");
|
||||
UnicodeSet wordChars = ups.getSet("name=.*MODIFIER LETTER.*", new RegexMatcher());
|
||||
{
|
||||
wordChars.retainAll(ups.getSet("gc=Sk"));
|
||||
wordChars.addAll(new UnicodeSet("[\\u0027 \\u002D \\u002E \\u003A \\u00B7 \\u058A \\u05F3" +
|
||||
" \\u05F4 \\u200C \\u200D \\u2010 \\u2019 \\u2027 \\u30A0 \\u04C0]"));
|
||||
//wordChars.removeAll(xid_continue);
|
||||
}
|
||||
|
||||
UnicodeSet patternProp = ups.getSet("Pattern_Syntax=true").removeAll(wordChars);
|
||||
UnicodeSet isNFKC = ups.getSet("NFKC_Quickcheck=NO").complement();
|
||||
|
||||
UnicodeSet not_xid_continue = ups.getSet("XID_Continue=true").complement().removeAll(wordChars);
|
||||
UnicodeSet not_xid_continue = new UnicodeSet(xid_continue).complement().removeAll(wordChars);
|
||||
|
||||
//UnicodeSet[] decompChars = new UnicodeSet[100];
|
||||
UCD ucd = Default.ucd();
|
||||
|
@ -180,7 +187,8 @@ public class TestData implements UCD_Types {
|
|||
"[[:Bidi_Class=AL:][:Bidi_Class=R:]]");
|
||||
|
||||
UnicodeSet bidiL = new UnicodeSet("[:Bidi_Class=l:]");
|
||||
UnicodeSet hasUpper = new UnicodeSet();
|
||||
UnicodeSet hasNoUpper = new UnicodeSet();
|
||||
UnicodeSet hasNoUpperMinus = new UnicodeSet();
|
||||
BagFormatter bf = new BagFormatter();
|
||||
UnicodeSet inIDN = new UnicodeSet();
|
||||
|
||||
|
@ -200,16 +208,16 @@ public class TestData implements UCD_Types {
|
|||
if (!Default.nfd().isNormalized(cp)) decomposable.add(cp);
|
||||
int idnaType = getIDNAType(cp);
|
||||
idnaTypeSet[idnaType].add(cp);
|
||||
String str = UTF16.valueOf(cp);
|
||||
if (str.equals(ucd.getCase(str, FULL, UPPER))) hasNoUpper.add(cp);
|
||||
int script = ucd.getScript(cp);
|
||||
if (coreChars[script] == null)
|
||||
coreChars[script] = new UnicodeSet();
|
||||
coreChars[script].add(cp);
|
||||
}
|
||||
// find characters with no uppercase
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(lowercase); it.next();) {
|
||||
String str = UTF16.valueOf(it.codepoint);
|
||||
if (!str.equals(ucd.getCase(str, FULL, UPPER))) hasUpper.add(it.codepoint);
|
||||
}
|
||||
// fix characters with no uppercase
|
||||
hasNoUpperMinus = new UnicodeSet(hasNoUpper).removeAll(wordChars);
|
||||
System.out.println(bf.showSetNames(hasNoUpper));
|
||||
|
||||
Utility.fixDot();
|
||||
PrintWriter htmlOut = BagFormatter.openUTF8Writer(GEN_DIR, "idn-chars.html");
|
||||
|
@ -245,6 +253,23 @@ public class TestData implements UCD_Types {
|
|||
showCodes(htmlOut, textOut, INHERITED_SCRIPT);
|
||||
htmlOut.println("</table></body></html>");
|
||||
htmlOut.close();
|
||||
bf.setMergeRanges(false);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** WORD CHARACTERS ADDED ***");
|
||||
bf.setValueSource("word-chars");
|
||||
bf.showSetNames(textOut, wordChars);
|
||||
|
||||
textOut.println();
|
||||
textOut.println("# *** FOR REVIEW (collected from above) ***");
|
||||
bf.setLabelSource(UnicodeLabel.NULL);
|
||||
for (Iterator it = new TreeSet(suspect.getAvailableValues()).iterator(); it.hasNext();) {
|
||||
textOut.println();
|
||||
String value = (String)it.next();
|
||||
bf.setValueSource(value);
|
||||
bf.showSetNames(textOut, suspect.getSet(value));
|
||||
}
|
||||
textOut.close();
|
||||
}
|
||||
|
||||
UnicodeSet idnaTypeSet[] = new UnicodeSet[IDNA_TYPE_LIMIT];
|
||||
|
@ -302,25 +327,38 @@ public class TestData implements UCD_Types {
|
|||
UnicodeSet illegal = extract(idnaTypeSet[ILLEGAL], core);
|
||||
UnicodeSet remapped = extract(idnaTypeSet[REMAPPED], core);
|
||||
|
||||
UnicodeSet remappedIsNFKC = extract(isNFKC, remapped);
|
||||
UnicodeSet remappedIsNFKCDecomp = extract(decomposable, remappedIsNFKC);
|
||||
|
||||
UnicodeSet decomp = extract(decomposable, core);
|
||||
UnicodeSet pattern = extract(patternProp, core);
|
||||
UnicodeSet non_id = extract(not_xid_continue, core);
|
||||
|
||||
UnicodeSet otherCore = new UnicodeSet(core).removeAll(hasUpper);
|
||||
core.removeAll(otherCore);
|
||||
if (core.size() == 0) {
|
||||
UnicodeSet temp = core;
|
||||
core = otherCore;
|
||||
otherCore = temp;
|
||||
UnicodeSet bicameralNoupper = new UnicodeSet();
|
||||
if (!hasNoUpper.containsAll(core)) {
|
||||
bicameralNoupper = extract(hasNoUpperMinus, core);
|
||||
}
|
||||
|
||||
UnicodeSet foo = new UnicodeSet(bicameralNoupper).addAll(non_id);
|
||||
for (UnicodeSetIterator it = new UnicodeSetIterator(foo); it.next(); ) {
|
||||
String cat = Default.ucd().getCategoryID(it.codepoint);
|
||||
String name = Default.ucd().getName(it.codepoint);
|
||||
if (name.indexOf("MUSICAL SYMBOL") >= 0
|
||||
|| name.indexOf("DINGBA") >= 0
|
||||
|| name.indexOf("RADICAL ") >= 0
|
||||
) cat = "XX";
|
||||
suspect.put(it.codepoint, cat);
|
||||
}
|
||||
|
||||
if (core.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic", core, scriptCode);
|
||||
if (otherCore.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", otherCore, scriptCode);
|
||||
if (bicameralNoupper.size() != 0) printlnSet(htmlOut, textOut, script, "Atomic-no-uppercase", bicameralNoupper, scriptCode);
|
||||
if (pattern.size() != 0) printlnSet(htmlOut, textOut, script, "Pattern_Syntax", pattern, scriptCode);
|
||||
if (non_id.size() != 0) printlnSet(htmlOut, textOut, script, "Non-XID", non_id, scriptCode);
|
||||
if (decomp.size() != 0) printlnSet(htmlOut, textOut, script, "Decomposable", decomp, scriptCode);
|
||||
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped", remapped, scriptCode);
|
||||
if (remappedIsNFKC.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Atomic", remappedIsNFKC, scriptCode);
|
||||
if (remappedIsNFKCDecomp.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Case-Decomposable", remappedIsNFKCDecomp, scriptCode);
|
||||
if (remapped.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Remapped-Compat", remapped, scriptCode);
|
||||
if (deleted.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Deleted", deleted, scriptCode);
|
||||
if (illegal.size() != 0) printlnSet(htmlOut, textOut, script, "IDN-Illegal", illegal, scriptCode);
|
||||
}
|
||||
|
|
|
@ -14,7 +14,9 @@
|
|||
.Non-XID { background-color: #FFCCCC }
|
||||
.Decomposable { background-color: #FFFFCC }
|
||||
.Pattern_Syntax { background-color: #FFCCFF }
|
||||
.IDN-Remapped { background-color: #FF6666 }
|
||||
.IDN-Remapped-Case-Atomic { background-color: #CCFFFF }
|
||||
.IDN-Remapped-Case-Decomposable { background-color: #66FFFF }
|
||||
.IDN-Remapped-Compat { background-color: #FF6666 }
|
||||
.IDN-Deleted { background-color: #66FF66 }
|
||||
.IDN-Illegal { background-color: #6666FF }
|
||||
th { text-align: left }
|
||||
|
@@ -25,7 +27,7 @@ th { text-align: left }
 <body style="margin: 2em">

 <h1>IDN Character Categorization</h1>
-<p><i>$Date: 2005/03/30 17:19:32 $, MED</i></p>
+<p><i>$Date: 2005/04/06 08:48:17 $, MED</i></p>
 <p>This page lists all of the valid output IDN characters broken down by category. By "output" IDN
 characters, we mean ones that can result from nameprep. Characters are grouped first by script, and
 then by subcategory. Within each subcategory characters are sorted according to the default
@@ -69,8 +71,17 @@ and name (in enabled browsers).</p>
 <td>Characters with NFC decompositions.</td>
 </tr>
 <tr>
-<td class="IDN-Remapped">IDN-Remapped</td>
-<td>Characters remapped by IDN.</td>
+<td class="IDN-Remapped-Case-Atomic">IDN-Remapped</td>
+<td>Characters remapped by IDN due to case folding</td>
+</tr>
+<tr>
+<td class="IDN-Remapped-Case-Decomposable">IDN-Remapped</td>
+<td>Characters remapped by IDN due to case folding, that are decomposable.</td>
+</tr>
+IDN-Remapped-Case-Decomposable
+<tr>
+<td class="IDN-Remapped-Compat">IDN-Remapped</td>
+<td>Characters remapped by IDN due to compatibility mapping.</td>
 </tr>
 <tr>
 <td class="IDN-Deleted">IDN-Deleted</td>