mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
updated symbol table
X-SVN-Rev: 14929
This commit is contained in:
parent
2bf3e1a0f1
commit
6a4883a9a3
2 changed files with 118 additions and 8 deletions
|
@ -9,6 +9,7 @@ import java.lang.reflect.Field;
|
|||
import java.text.ParseException;
|
||||
import java.text.ParsePosition;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
@ -61,7 +62,7 @@ public class MakeUnicodeFiles {
|
|||
|
||||
public static void main(String[] args) throws IOException {
|
||||
//generateFile();
|
||||
testInvariants(ToolUnicodePropertySource.make(Default.ucdVersion()));
|
||||
testInvariants();
|
||||
}
|
||||
|
||||
static class Format {
|
||||
|
@ -1105,15 +1106,60 @@ public class MakeUnicodeFiles {
|
|||
}
|
||||
}
|
||||
|
||||
static Matcher invariantLine = Pattern.compile("([^=><!?])\\s*([=><!?])\\s*([^=><!?])").matcher("");
|
||||
/**
|
||||
* Chain together several SymbolTables.
* Every query is forwarded to the chained tables one after another, and the
* first non-null answer is returned; null is returned when no table answers.
|
||||
* @author Davis
|
||||
*/
|
||||
static class ChainedSymbolTable implements SymbolTable {
|
||||
// TODO: add accessors?
// The chained tables, in the order in which they are consulted.
// Elements are cast to SymbolTable when iterated.
|
||||
private List symbolTables;
|
||||
/**
|
||||
* Each SymbolTable is accessed in order by the other methods,
|
||||
* so the first in the list is accessed first, etc.
|
||||
* @param symbolTables the tables to chain, in the order they should be consulted;
* the array is wrapped with Arrays.asList rather than copied
|
||||
*/
|
||||
ChainedSymbolTable(SymbolTable[] symbolTables) {
|
||||
this.symbolTables = Arrays.asList(symbolTables);
|
||||
}
|
||||
// Returns the first non-null result of lookup(s) among the chained tables,
// or null if every table returns null.
public char[] lookup(String s) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
char[] result = st.lookup(s);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Returns the first non-null result of lookupMatcher(ch) among the chained tables,
// or null if every table returns null.
public UnicodeMatcher lookupMatcher(int ch) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
UnicodeMatcher result = st.lookupMatcher(ch);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
// Warning: this depends on pos being left alone unless a string is returned!!
// The same pos object is handed to each table in turn, so a table that moved pos
// and then returned null would change what the following tables see.
|
||||
public String parseReference(String text, ParsePosition pos, int limit) {
|
||||
for (Iterator it = symbolTables.iterator(); it.hasNext();) {
|
||||
SymbolTable st = (SymbolTable) it.next();
|
||||
String result = st.parseReference(text, pos, limit);
|
||||
if (result != null) return result;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[=><!?]");
|
||||
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
|
||||
|
||||
static void testInvariants(UnicodeProperty.Factory factory) throws IOException {
|
||||
static void testInvariants() throws IOException {
|
||||
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
|
||||
out.write('\uFEFF'); // BOM
|
||||
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
|
||||
BagFormatter bf = new BagFormatter();
|
||||
SymbolTable st = factory.getSymbolTable();
|
||||
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
|
||||
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
|
||||
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
|
||||
ParsePosition pp = new ParsePosition(0);
|
||||
int parseErrorCount = 0;
|
||||
int testFailureCount = 0;
|
||||
|
@ -1122,6 +1168,7 @@ public class MakeUnicodeFiles {
|
|||
String leftSide = null;
|
||||
String line = in.readLine();
|
||||
if (line == null) break;
|
||||
if (line.startsWith("\uFEFF")) line = line.substring(1);
|
||||
line = line.trim();
|
||||
int pos = line.indexOf('#');
|
||||
if (pos >= 0) line = line.substring(0,pos).trim();
|
||||
|
@ -1137,7 +1184,8 @@ public class MakeUnicodeFiles {
|
|||
eatWhitespace(line, pp);
|
||||
relation = line.charAt(pp.getIndex());
|
||||
if (!INVARIANT_RELATIONS.contains(relation)) {
|
||||
throw new ParseException("Invalid relation", pp.getIndex());
|
||||
throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false),
|
||||
pp.getIndex());
|
||||
}
|
||||
pp.setIndex(pp.getIndex()+1); // skip char
|
||||
eatWhitespace(line, pp);
|
||||
|
@ -1172,8 +1220,10 @@ public class MakeUnicodeFiles {
|
|||
boolean ok = true;
|
||||
switch(relation) {
|
||||
case '=': ok = leftSet.equals(rightSet); break;
|
||||
case '>': ok = leftSet.containsAll(rightSet); break;
|
||||
case '<': ok = rightSet.containsAll(leftSet); break;
|
||||
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
|
||||
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
|
||||
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
|
||||
case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break;
|
||||
case '!': ok = leftSet.containsNone(rightSet); break;
|
||||
case '?': ok = !leftSet.equals(rightSet)
|
||||
&& !leftSet.containsAll(rightSet)
|
||||
|
|
60
tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt
Normal file
60
tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt
Normal file
|
@ -0,0 +1,60 @@
|
|||
# Invariance tests
|
||||
# Each line indicates an invariant set relationship to be tested,
|
||||
# and is of the form:
|
||||
#
|
||||
# line := set relation set
|
||||
#
|
||||
# relation := '=' // has identical contents to
|
||||
# := ('>' | '⊃') // is proper superset of
|
||||
# := ('≥' | '⊇') // is superset of
|
||||
# := ('<' | '⊂') // is proper subset of
|
||||
# := ('≤' | '⊆') // is subset of
|
||||
# := '!' // has no intersection
|
||||
# := '?' // none of the above (they overlap, and neither contains the other)
|
||||
#
|
||||
# A set is a standard UnicodeSet, but where $pv can be used to express properties
|
||||
#
|
||||
# pv := '$' '×'? prop (('=' | ':') value)?
|
||||
#
|
||||
# The × indicates that the property is the previous released version.
|
||||
# That is, if the version is 4.0.1, then the × version is 4.0.0
|
||||
# If the value is missing, it is defaulted to true
|
||||
# If the value is of the form «...», then the ... is interpreted as a regular expression
|
||||
# The property can be the short or long form as in the PropertyAliases.txt
|
||||
# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt
|
||||
#
|
||||
# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in
|
||||
# Perl or other regular-expression languages. Examples:
|
||||
# [$General_Category:Unassigned-[a-zA-Z]]
|
||||
# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html
|
||||
#
|
||||
# WARNING: do not use \p{...} or [:...:] syntax, since those will be
|
||||
# ICU's current version of properties, not the current snapshot's.
|
||||
# Use the $ notation for properties (listed above) instead.
|
||||
#
|
||||
# When this file is parsed, an error message may contain <@>
|
||||
# to indicate the location of an error in the input line.
|
||||
|
||||
# The following are not very interesting, but show examples of use
|
||||
|
||||
#$GC:Zs ! $GC:Zp
|
||||
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
|
||||
$GC:Zs ? $Name:«.*SPACE.*»
|
||||
|
||||
# Examples of parsing errors
|
||||
|
||||
# $LBA:Neutral = $GC:Zp # example of nonexistent property
|
||||
# $LB:foo = $GC:Zp # example of nonexistent value
|
||||
# $GC:Zs @ $GC:Zp # example of unknown relation
|
||||
|
||||
# The following should be real invariants
|
||||
# For illustration, different alias styles are used
|
||||
|
||||
$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
|
||||
$LB:OP = $GC:Ps
|
||||
$General_Category:Decimal_Number = $Numeric_Type:Decimal
|
||||
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
|
||||
$ID_Start ⊇ $×ID_Start
|
||||
$ID_Continue ⊇ $×ID_Continue
|
||||
|
||||
|
Loading…
Add table
Reference in a new issue