mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-0 updated for 4.1
X-SVN-Rev: 16940
This commit is contained in:
parent
3e4d1a861a
commit
3daf3898fb
10 changed files with 358 additions and 214 deletions
|
@ -15,6 +15,7 @@ public final class Default implements UCD_Types {
|
|||
private static Normalizer nfkc;
|
||||
private static Normalizer nfkd;
|
||||
private static Normalizer[] nf = new Normalizer[4];
|
||||
private static String year;
|
||||
|
||||
public static void setUCD(String version) {
|
||||
ucdVersion = version;
|
||||
|
@ -37,14 +38,21 @@ public final class Default implements UCD_Types {
|
|||
}
|
||||
|
||||
static DateFormat myDateFormat = new SimpleDateFormat("yyyy-MM-dd', 'HH:mm:ss' GMT'");
|
||||
static DateFormat yearFormat = new SimpleDateFormat("yyyy");
|
||||
|
||||
static {
|
||||
myDateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
|
||||
year = yearFormat.format(new Date());
|
||||
}
|
||||
|
||||
public static String getDate() {
|
||||
return myDateFormat.format(new Date());
|
||||
}
|
||||
|
||||
public static String getYear() {
|
||||
return year;
|
||||
}
|
||||
|
||||
public static String ucdVersion() {
|
||||
if (ucd == null) setUCD();
|
||||
return ucdVersion;
|
||||
|
@ -75,4 +83,11 @@ public final class Default implements UCD_Types {
|
|||
return nf[index];
|
||||
}
|
||||
|
||||
/**
|
||||
* @param lineValue
|
||||
*/
|
||||
public static void setYear(String lineValue) {
|
||||
year = lineValue;
|
||||
}
|
||||
|
||||
}
|
|
@ -34,12 +34,13 @@ import com.ibm.icu.text.SymbolTable;
|
|||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.UCD.MakeUnicodeFiles.Format.PrintStyle;
|
||||
import com.ibm.text.utility.UnicodeDataFile;
|
||||
import com.ibm.text.utility.Utility;
|
||||
import com.ibm.icu.text.Collator;
|
||||
|
||||
public class MakeUnicodeFiles {
|
||||
public static int dVersion = 6; // change to fix the generated file D version. If less than zero, no "d"
|
||||
public static int dVersion = -1; // change to fix the generated file D version. If less than zero, no "d"
|
||||
|
||||
/*static String[] hackNameList = {
|
||||
"noBreak", "Arabic_Presentation_Forms-A", "Arabic_Presentation_Forms-B",
|
||||
|
@ -62,8 +63,7 @@ public class MakeUnicodeFiles {
|
|||
static boolean DEBUG = false;
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
//generateFile();
|
||||
testInvariants();
|
||||
generateFile();
|
||||
}
|
||||
|
||||
static class Format {
|
||||
|
@ -321,6 +321,8 @@ public class MakeUnicodeFiles {
|
|||
}
|
||||
} else if (line.startsWith("DeltaVersion:")) {
|
||||
dVersion = Integer.parseInt(lineValue);
|
||||
} else if (line.startsWith("CopyrightYear:")) {
|
||||
Default.setYear(lineValue);
|
||||
} else if (line.startsWith("File:")) {
|
||||
int p2 = lineValue.lastIndexOf('/');
|
||||
file = lineValue.substring(p2+1);
|
||||
|
@ -758,7 +760,7 @@ public class MakeUnicodeFiles {
|
|||
else bf.setPropName(name);
|
||||
|
||||
if (ps.interleaveValues) {
|
||||
writeInterleavedValues(pw, bf, prop);
|
||||
writeInterleavedValues(pw, bf, prop, ps);
|
||||
} else if (prop.isType(UnicodeProperty.STRING_OR_MISC_MASK)) {
|
||||
writeStringValues(pw, bf, prop);
|
||||
//} else if (prop.isType(UnicodeProperty.BINARY_MASK)) {
|
||||
|
@ -904,10 +906,10 @@ public class MakeUnicodeFiles {
|
|||
private static void writeInterleavedValues(
|
||||
PrintWriter pw,
|
||||
BagFormatter bf,
|
||||
UnicodeProperty prop) {
|
||||
UnicodeProperty prop, PrintStyle ps) {
|
||||
if (DEBUG) System.out.println("Writing Interleaved Values: " + prop.getName());
|
||||
pw.println();
|
||||
bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, new RestoreSpacesFilter()))
|
||||
bf.setValueSource(new UnicodeProperty.FilteredProperty(prop, new RestoreSpacesFilter(ps)))
|
||||
.setNameSource(null)
|
||||
.setLabelSource(null)
|
||||
.setRangeBreakSource(null)
|
||||
|
@ -945,10 +947,20 @@ public class MakeUnicodeFiles {
|
|||
}
|
||||
|
||||
static class RestoreSpacesFilter extends UnicodeProperty.StringFilter {
|
||||
public String remap(String original) {
|
||||
String skipValue;
|
||||
/**
|
||||
* @param ps
|
||||
*/
|
||||
public RestoreSpacesFilter(PrintStyle ps) {
|
||||
skipValue = ps.skipValue;
|
||||
if (skipValue == null) skipValue = ps.skipUnassigned;
|
||||
}
|
||||
|
||||
public String remap(String original) {
|
||||
// ok, because doesn't change length
|
||||
String mod = (String) Format.theFormat.hackMap.get(original);
|
||||
if (mod != null) original = mod;
|
||||
if (original.equals(skipValue)) return null;
|
||||
return original.replace('_',' ');
|
||||
}
|
||||
}
|
||||
|
@ -1147,185 +1159,7 @@ public class MakeUnicodeFiles {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Chain together several SymbolTables.
|
||||
* @author Davis
|
||||
*/
|
||||
static class ChainedSymbolTable implements SymbolTable {
|
||||
// TODO: add accessors?
|
||||
private List symbolTables;
|
||||
/**
|
||||
* Each SymbolTable is each accessed in order by the other methods,
|
||||
* so the first in the list is accessed first, etc.
|
||||
* @param symbolTables
|
||||
*/
|
||||
ChainedSymbolTable(SymbolTable[] symbolTables) {
|
||||
this.symbolTables = Arrays.asList(symbolTables);
|
||||
}
|
||||
public char[] lookup(String s) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
char[] result = st.lookup(s);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public UnicodeMatcher lookupMatcher(int ch) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
UnicodeMatcher result = st.lookupMatcher(ch);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Warning: this depends on pos being left alone unless a string is returned!!
|
||||
public String parseReference(String text, ParsePosition pos, int limit) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
String result = st.parseReference(text, pos, limit);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
|
||||
|
||||
public static void testInvariants() throws IOException {
|
||||
String[][] variables = new String[100][2];
|
||||
int variableCount = 0;
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
|
||||
BagFormatter bf = new BagFormatter();
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
ParsePosition pp = new ParsePosition(0);
|
||||
int parseErrorCount = 0;
|
||||
int testFailureCount = 0;
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
out.println(line);
|
||||
line = line.trim();
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
// fix all the variables
|
||||
String oldLine = line;
|
||||
line = Utility.replace(line, variables, variableCount);
|
||||
|
||||
// detect variables
|
||||
if (line.startsWith("Let")) {
|
||||
int x = line.indexOf('=');
|
||||
variables[variableCount][0] = line.substring(3,x).trim();
|
||||
variables[variableCount][1] = line.substring(x+1).trim();
|
||||
variableCount++;
|
||||
System.out.println("Added variable: <" + variables[variableCount-1][0] + "><"
|
||||
+ variables[variableCount-1][1] + ">");
|
||||
continue;
|
||||
}
|
||||
|
||||
char relation = 0;
|
||||
String rightSide = null;
|
||||
String leftSide = null;
|
||||
UnicodeSet leftSet = null;
|
||||
UnicodeSet rightSet = null;
|
||||
try {
|
||||
pp.setIndex(0);
|
||||
leftSet = new UnicodeSet(line, pp, st);
|
||||
leftSide = line.substring(0,pp.getIndex());
|
||||
eatWhitespace(line, pp);
|
||||
relation = line.charAt(pp.getIndex());
|
||||
if (!INVARIANT_RELATIONS.contains(relation)) {
|
||||
throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false),
|
||||
pp.getIndex());
|
||||
}
|
||||
pp.setIndex(pp.getIndex()+1); // skip char
|
||||
eatWhitespace(line, pp);
|
||||
int start = pp.getIndex();
|
||||
rightSet = new UnicodeSet(line, pp, st);
|
||||
rightSide = line.substring(start,pp.getIndex());
|
||||
eatWhitespace(line, pp);
|
||||
if (line.length() != pp.getIndex()) {
|
||||
throw new ParseException("Extra characters at end", pp.getIndex());
|
||||
}
|
||||
} catch (ParseException e) {
|
||||
out.println("PARSE ERROR:\t" + line.substring(0,e.getErrorOffset())
|
||||
+ "<@>" + line.substring(e.getErrorOffset()));
|
||||
out.println();
|
||||
out.println("**** START Error Info ****");
|
||||
out.println(e.getMessage());
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
parseErrorCount++;
|
||||
continue;
|
||||
} catch (IllegalArgumentException e) {
|
||||
out.println("PARSE ERROR:\t" + line);
|
||||
out.println();
|
||||
out.println("**** START Error Info ****");
|
||||
out.println(e.getMessage());
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
parseErrorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean ok = true;
|
||||
switch(relation) {
|
||||
case '=': ok = leftSet.equals(rightSet); break;
|
||||
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
|
||||
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
|
||||
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
|
||||
case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break;
|
||||
case '!': ok = leftSet.containsNone(rightSet); break;
|
||||
case '?': ok = !leftSet.equals(rightSet)
|
||||
&& !leftSet.containsAll(rightSet)
|
||||
&& !rightSet.containsAll(leftSet)
|
||||
&& !leftSet.containsNone(rightSet);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Internal Error");
|
||||
}
|
||||
if (ok) continue;
|
||||
out.println();
|
||||
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
|
||||
out.println("**** START Error Info ****");
|
||||
bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
testFailureCount++;
|
||||
}
|
||||
out.println();
|
||||
out.println("**** SUMMARY ****");
|
||||
out.println();
|
||||
out.println("ParseErrorCount=" + parseErrorCount);
|
||||
out.println("TestFailureCount=" + testFailureCount);
|
||||
out.close();
|
||||
System.out.println("ParseErrorCount=" + parseErrorCount);
|
||||
System.out.println("TestFailureCount=" + testFailureCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param line
|
||||
* @param pp
|
||||
*/
|
||||
private static void eatWhitespace(String line, ParsePosition pp) {
|
||||
int cp = 0;
|
||||
int i;
|
||||
for (i = pp.getIndex(); i < line.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(line, i);
|
||||
if (!com.ibm.icu.lang.UCharacter.isUWhiteSpace(cp)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
pp.setIndex(i);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
static class PropertySymbolTable implements SymbolTable {
|
||||
static boolean DEBUG = false;
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
Generate:
|
||||
DeltaVersion: 7
|
||||
DeltaVersion: 8
|
||||
CopyrightYear: 2005
|
||||
|
||||
File: GraphemeClusterBreakProperty
|
||||
File: uax29/GraphemeBreakProperty
|
||||
Property: Grapheme_Cluster_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: WordBreakProperty
|
||||
File: uax29/WordBreakProperty
|
||||
Property: Word_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
File: SentenceBreakProperty
|
||||
File: uax29/SentenceBreakProperty
|
||||
Property: Sentence_Break
|
||||
Format: skipValue=Other
|
||||
|
||||
|
@ -20,9 +21,7 @@ Property: Block
|
|||
# For example, "Latin Extended-A" and "latin extended a" are equivalent.
|
||||
# For more information on the comparison of property values,
|
||||
# see UCD.html.
|
||||
#
|
||||
# Code points not explicitly listed in this file are given the value No_Block.
|
||||
Format: valueList
|
||||
Format: valueList skipUnassigned=No_Block
|
||||
|
||||
File: CaseFolding
|
||||
Property: SPECIAL
|
||||
|
@ -363,7 +362,7 @@ HackName: Supplemental_Arrows-B
|
|||
HackName: Supplementary_Private_Use_Area-A
|
||||
HackName: Supplementary_Private_Use_Area-B
|
||||
HackName: Canadian-Aboriginal
|
||||
HackName: Old-Italic
|
||||
#HackName: Old-Italic
|
||||
|
||||
FinalComments
|
||||
Note that PropertyAliases sorts by the long name, while PropertyValueAliases
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/TestData.java,v $
|
||||
* $Date: 2004/02/18 03:09:00 $
|
||||
* $Revision: 1.15 $
|
||||
* $Date: 2004/12/11 06:03:08 $
|
||||
* $Revision: 1.16 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -137,8 +137,71 @@ public class TestData implements UCD_Types {
|
|||
log.close();
|
||||
}
|
||||
}
|
||||
|
||||
static PrintWriter log;
|
||||
|
||||
public static void checkShaping() throws IOException {
|
||||
log = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "checklog.txt");
|
||||
checkProperty("Joining_Type", "Non_Joining", "Joining_Type", "Transparent");
|
||||
checkProperty("Joining_Group", "No_Joining_Group", "Joining_Type", "Transparent");
|
||||
checkProperty("Line_Break", "Unknown", "Line_Break", "Combining_Mark");
|
||||
checkProperty("East_Asian_Width", null, "Line_Break", "Combining_Mark");
|
||||
checkProperty("Bidi_Class", null, "Line_Break", "Combining_Mark");
|
||||
checkProperty("Script", null, "Script", new String[]{"Common", "Inherited"});
|
||||
checkProperty("General_Category", null, "General_Category", new String[]{"Spacing_Mark",
|
||||
"Enclosing_Mark", "Nonspacing_Mark"});
|
||||
log.close();
|
||||
}
|
||||
|
||||
public static class RegexMatcher implements UnicodeProperty.Matcher {
|
||||
/**
|
||||
* @param propertyName
|
||||
* @param exclusion
|
||||
* @param ignorePropertyName TODO
|
||||
* @param ignoreValue
|
||||
*/
|
||||
private static void checkProperty(String propertyName, String exclusion, String ignorePropertyName, Object ignoreValueList) {
|
||||
log.println();
|
||||
log.println(propertyName + " Check");
|
||||
log.println();
|
||||
Set ignoreValueSet = new HashSet();
|
||||
if (ignoreValueList instanceof String) ignoreValueSet.add(ignoreValueList);
|
||||
else ignoreValueSet.addAll(Arrays.asList((Object[])ignoreValueList));
|
||||
|
||||
ToolUnicodePropertySource ups = ToolUnicodePropertySource.make("4.0.1");
|
||||
UnicodeProperty up = ups.getProperty(propertyName);
|
||||
UnicodeProperty ignProp = ups.getProperty(ignorePropertyName);
|
||||
UnicodeProperty name = ups.getProperty("Name");
|
||||
UnicodeSet significant = (exclusion != null ? up.getSet(exclusion) : new UnicodeSet()).complement();
|
||||
UnicodeSetIterator it = new UnicodeSetIterator(significant);
|
||||
Normalizer n = new Normalizer(Normalizer.NFD, "4.0.1");
|
||||
int counter = 0;
|
||||
while (it.next()) {
|
||||
String baseValue = up.getValue(it.codepoint);
|
||||
String nfd = n.normalize(it.codepoint);
|
||||
if (n.isNormalized(it.codepoint)) continue;
|
||||
//if (nfd.equals(it.getString())) continue;
|
||||
int cp;
|
||||
for (int i = 0; i < nfd.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(nfd, i);
|
||||
boolean shown = false;
|
||||
String newValue = up.getValue(cp);
|
||||
String possIgnValue = ignProp.getValue(cp);
|
||||
if (ignoreValueSet.contains(possIgnValue)) {
|
||||
//log.println("--- " + newValue + "\t" + Utility.hex(cp) + " " + name.getValue(cp));
|
||||
continue;
|
||||
}
|
||||
//log.println("*** " + newValue + "\t" + Utility.hex(cp) + " " + name.getValue(cp));
|
||||
|
||||
if (!baseValue.equals(newValue)) {
|
||||
if (!shown) log.println((++counter) + "\tCONFLICT\t" + baseValue + "\t" + Utility.hex(it.codepoint) + " " + name.getValue(it.codepoint));
|
||||
log.println("\tNFD(" + Utility.hex(it.codepoint) + ") contains:\t" + newValue + "\t" + Utility.hex(cp) + " " + name.getValue(cp));
|
||||
shown = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static class RegexMatcher implements UnicodeProperty.Matcher {
|
||||
private Matcher matcher;
|
||||
|
||||
public UnicodeProperty.Matcher set(String pattern) {
|
||||
|
|
203
tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java
Normal file
203
tools/unicodetools/com/ibm/text/UCD/TestUnicodeInvariants.java
Normal file
|
@ -0,0 +1,203 @@
|
|||
package com.ibm.text.UCD;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.text.ParseException;
|
||||
import java.text.ParsePosition;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.dev.test.util.BagFormatter;
|
||||
import com.ibm.icu.text.SymbolTable;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.UnicodeMatcher;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.text.utility.Utility;
|
||||
|
||||
public class TestUnicodeInvariants {
|
||||
|
||||
public static void main(String[] args) throws IOException {
|
||||
testInvariants();
|
||||
}
|
||||
|
||||
/**
|
||||
* Chain together several SymbolTables.
|
||||
* @author Davis
|
||||
*/
|
||||
static class ChainedSymbolTable implements SymbolTable {
|
||||
// TODO: add accessors?
|
||||
private List symbolTables;
|
||||
/**
|
||||
* Each SymbolTable is each accessed in order by the other methods,
|
||||
* so the first in the list is accessed first, etc.
|
||||
* @param symbolTables
|
||||
*/
|
||||
ChainedSymbolTable(SymbolTable[] symbolTables) {
|
||||
this.symbolTables = Arrays.asList(symbolTables);
|
||||
}
|
||||
public char[] lookup(String s) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
char[] result = st.lookup(s);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public UnicodeMatcher lookupMatcher(int ch) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
UnicodeMatcher result = st.lookupMatcher(ch);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Warning: this depends on pos being left alone unless a string is returned!!
|
||||
public String parseReference(String text, ParsePosition pos, int limit) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
String result = st.parseReference(text, pos, limit);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\~ \\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
|
||||
|
||||
public static void testInvariants() throws IOException {
|
||||
String[][] variables = new String[100][2];
|
||||
int variableCount = 0;
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
|
||||
BagFormatter bf = new BagFormatter();
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
ParsePosition pp = new ParsePosition(0);
|
||||
int parseErrorCount = 0;
|
||||
int testFailureCount = 0;
|
||||
while (true) {
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
out.println(line);
|
||||
line = line.trim();
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
if (line.length() == 0) continue;
|
||||
|
||||
// fix all the variables
|
||||
String oldLine = line;
|
||||
line = Utility.replace(line, variables, variableCount);
|
||||
|
||||
// detect variables
|
||||
if (line.startsWith("Let")) {
|
||||
int x = line.indexOf('=');
|
||||
variables[variableCount][0] = line.substring(3,x).trim();
|
||||
variables[variableCount][1] = line.substring(x+1).trim();
|
||||
variableCount++;
|
||||
if (false) System.out.println("Added variable: <" + variables[variableCount-1][0] + "><"
|
||||
+ variables[variableCount-1][1] + ">");
|
||||
continue;
|
||||
}
|
||||
|
||||
char relation = 0;
|
||||
String rightSide = null;
|
||||
String leftSide = null;
|
||||
UnicodeSet leftSet = null;
|
||||
UnicodeSet rightSet = null;
|
||||
try {
|
||||
pp.setIndex(0);
|
||||
leftSet = new UnicodeSet(line, pp, st);
|
||||
leftSide = line.substring(0,pp.getIndex());
|
||||
eatWhitespace(line, pp);
|
||||
relation = line.charAt(pp.getIndex());
|
||||
if (!INVARIANT_RELATIONS.contains(relation)) {
|
||||
throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false),
|
||||
pp.getIndex());
|
||||
}
|
||||
pp.setIndex(pp.getIndex()+1); // skip char
|
||||
eatWhitespace(line, pp);
|
||||
int start = pp.getIndex();
|
||||
rightSet = new UnicodeSet(line, pp, st);
|
||||
rightSide = line.substring(start,pp.getIndex());
|
||||
eatWhitespace(line, pp);
|
||||
if (line.length() != pp.getIndex()) {
|
||||
throw new ParseException("Extra characters at end", pp.getIndex());
|
||||
}
|
||||
} catch (ParseException e) {
|
||||
out.println("PARSE ERROR:\t" + line.substring(0,e.getErrorOffset())
|
||||
+ "<@>" + line.substring(e.getErrorOffset()));
|
||||
out.println();
|
||||
out.println("**** START Error Info ****");
|
||||
out.println(e.getMessage());
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
parseErrorCount++;
|
||||
continue;
|
||||
} catch (IllegalArgumentException e) {
|
||||
out.println("PARSE ERROR:\t" + line);
|
||||
out.println();
|
||||
out.println("**** START Error Info ****");
|
||||
out.println(e.getMessage());
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
parseErrorCount++;
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean ok = true;
|
||||
switch(relation) {
|
||||
case '=': ok = leftSet.equals(rightSet); break;
|
||||
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
|
||||
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
|
||||
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
|
||||
case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break;
|
||||
case '!': ok = leftSet.containsNone(rightSet); break;
|
||||
case '?': ok = !leftSet.equals(rightSet)
|
||||
&& !leftSet.containsAll(rightSet)
|
||||
&& !rightSet.containsAll(leftSet)
|
||||
&& !leftSet.containsNone(rightSet);
|
||||
break;
|
||||
default: throw new IllegalArgumentException("Internal Error");
|
||||
}
|
||||
if (ok) continue;
|
||||
out.println();
|
||||
out.println(String.valueOf(ok).toUpperCase(Locale.ENGLISH));
|
||||
out.println("**** START Error Info ****");
|
||||
bf.showSetDifferences(out, rightSide, rightSet, leftSide, leftSet);
|
||||
out.println("**** END Error Info ****");
|
||||
out.println();
|
||||
testFailureCount++;
|
||||
}
|
||||
out.println();
|
||||
out.println("**** SUMMARY ****");
|
||||
out.println();
|
||||
out.println("ParseErrorCount=" + parseErrorCount);
|
||||
out.println("TestFailureCount=" + testFailureCount);
|
||||
out.close();
|
||||
System.out.println("ParseErrorCount=" + parseErrorCount);
|
||||
System.out.println("TestFailureCount=" + testFailureCount);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param line
|
||||
* @param pp
|
||||
*/
|
||||
private static void eatWhitespace(String line, ParsePosition pp) {
|
||||
int cp = 0;
|
||||
int i;
|
||||
for (i = pp.getIndex(); i < line.length(); i += UTF16.getCharCount(cp)) {
|
||||
cp = UTF16.charAt(line, i);
|
||||
if (!com.ibm.icu.lang.UCharacter.isUWhiteSpace(cp)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
pp.setIndex(i);
|
||||
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD.java,v $
|
||||
* $Date: 2004/11/13 23:10:32 $
|
||||
* $Revision: 1.36 $
|
||||
* $Date: 2004/12/11 06:03:08 $
|
||||
* $Revision: 1.37 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -1094,6 +1094,7 @@ public final class UCD implements UCD_Types {
|
|||
if (ch <= 0x4DB5) return 0x3400;
|
||||
if (ch <= 0x4E00) return ch; // CJK Ideograph
|
||||
if (ch <= 0x9FA5) return 0x4E00;
|
||||
if (ch <= 0x9FBB && rCompositeVersion >= 0x40100) return 0x4E00;
|
||||
if (ch <= 0xAC00) return ch; // Hangul Syllable
|
||||
if (ch <= 0xD7A3) return 0xAC00;
|
||||
if (ch <= 0xD800) return ch; // Non Private Use High Surrogate
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Names.java,v $
|
||||
* $Date: 2004/11/13 23:10:32 $
|
||||
* $Revision: 1.29 $
|
||||
* $Date: 2004/12/11 06:03:08 $
|
||||
* $Revision: 1.30 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -318,7 +318,7 @@ final class UCD_Names implements UCD_Types {
|
|||
"BOPOMOFO", // BOPOMOFO
|
||||
"HAN", // HAN
|
||||
"YI", // YI
|
||||
"OLD-ITALIC",
|
||||
"OLD_ITALIC",
|
||||
"GOTHIC",
|
||||
"DESERET",
|
||||
"INHERITED", // nonspacing marks
|
||||
|
@ -335,6 +335,14 @@ final class UCD_Names implements UCD_Types {
|
|||
"CYPRIOT",
|
||||
"BRAILLE",
|
||||
"KATAKANA_OR_HIRAGANA",
|
||||
"BUGINESE",
|
||||
"COPTIC",
|
||||
"NEW_TAI_LUE",
|
||||
"GLAGOLITIC",
|
||||
"TIFINAGH",
|
||||
"SYLOTI_NAGRI",
|
||||
"OLD_PERSIAN",
|
||||
"KHAROSHTHI",
|
||||
|
||||
};
|
||||
|
||||
|
@ -403,6 +411,14 @@ final class UCD_Names implements UCD_Types {
|
|||
"Cprt",
|
||||
"Brai",
|
||||
"Hrkt",
|
||||
"Bugi",
|
||||
"Copt",
|
||||
"Talu",
|
||||
"Glag",
|
||||
"Tfng",
|
||||
"Sylo",
|
||||
"Xpeo",
|
||||
"Khar",
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/UCD_Types.java,v $
|
||||
* $Date: 2004/11/13 23:10:32 $
|
||||
* $Revision: 1.29 $
|
||||
* $Date: 2004/12/11 06:03:08 $
|
||||
* $Revision: 1.30 $
|
||||
*
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -390,7 +390,16 @@ public interface UCD_Types {
|
|||
CYPRIOT = 52,
|
||||
BRAILLE = 53,
|
||||
KATAKANA_OR_HIRAGANA = 54,
|
||||
LIMIT_SCRIPT = 55;
|
||||
BUGINESE = 55,
|
||||
COPTIC = 56,
|
||||
NEW_TAI_LUE = 57,
|
||||
GLAGOLITIC = 58,
|
||||
TIFINAGH = 59,
|
||||
SYLOTI_NAGRI = 60,
|
||||
OLD_PERSIAN = 61,
|
||||
KHAROSHTHI = 62,
|
||||
|
||||
LIMIT_SCRIPT = 63;
|
||||
|
||||
static final int
|
||||
UNKNOWN = 0,
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
|
||||
$GC:Zs ? $Name:«.*SPACE.*»
|
||||
|
||||
[$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
|
||||
# [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
|
||||
|
||||
# Examples of parsing errors
|
||||
|
||||
|
@ -56,6 +56,10 @@ $Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
|
|||
$LB:OP = $GC:Ps
|
||||
$General_Category:Decimal_Number = $Numeric_Type:Decimal
|
||||
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
|
||||
$Dash ⊃ [$GC:Pd]
|
||||
$Script:Common ! [$GC:Mn $GC:Me]
|
||||
$Script:Common ! [$Alphabetic - $Math]
|
||||
$Alphabetic ⊃ [$Uppercase $Lowercase]
|
||||
|
||||
# Comparisons across versions
|
||||
|
||||
|
@ -71,7 +75,7 @@ $Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
|
|||
$Lowercase = [$GC:Ll $Other_Lowercase]
|
||||
$Uppercase = [$GC:Lu $Other_Uppercase]
|
||||
$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
|
||||
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc] $Other_ID_Continue
|
||||
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue]
|
||||
$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
|
||||
$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
|
||||
$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
|
||||
|
@ -87,8 +91,8 @@ $Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf
|
|||
$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
|
||||
|
||||
# Testing
|
||||
$script:greek = $×script:greek
|
||||
$gc:lm = $script:inherited
|
||||
# $script:greek = $×script:greek
|
||||
# $gc:lm = $script:inherited
|
||||
|
||||
# ===========================
|
||||
|
||||
|
@ -110,7 +114,7 @@ Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}]
|
|||
Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
|
||||
Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
|
||||
Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark]
|
||||
Let $strange = [\u24B6-\u24CF]
|
||||
Let $strange = [\u24B6-\u24E9]
|
||||
|
||||
# Unassigned, Control, Format, Private_Use, Surrogate,
|
||||
# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter,
|
||||
|
@ -122,9 +126,9 @@ Let $strange = [\u24B6-\u24CF]
|
|||
|
||||
# UTS Rules
|
||||
|
||||
Let $alpha = [$Alphabetic $Lowercase] # $Uppercase $ZWNJ $ZWJ]
|
||||
Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ]
|
||||
Let $lower = $Lowercase
|
||||
Let $upper = [$Uppercase - $strange]
|
||||
Let $upper = [$Uppercase]
|
||||
Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha]
|
||||
Let $digit = $gc:Decimal_Number
|
||||
Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both!
|
||||
|
@ -132,7 +136,7 @@ Let $alnum = [$alpha $digit]
|
|||
Let $space = $Whitespace
|
||||
Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]]
|
||||
Let $cntrl = $gc:Control
|
||||
Let $graph = [^$space $gc:Control $gc:Format $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ]
|
||||
Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ]
|
||||
Let $print = [$graph $blank - $cntrl]
|
||||
Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation]
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ public class UnicodeDataFile {
|
|||
result.out.println(generateDateLine());
|
||||
result.out.println("#");
|
||||
result.out.println("# Unicode Character Database");
|
||||
result.out.println("# Copyright (c) 1991-2004 Unicode, Inc.");
|
||||
result.out.println("# Copyright (c) 1991-" + Default.getYear() + " Unicode, Inc.");
|
||||
result.out.println(
|
||||
"# For terms of use, see http://www.unicode.org/terms_of_use.html");
|
||||
result.out.println("# For documentation, see UCD.html");
|
||||
|
|
Loading…
Add table
Reference in a new issue