updated symbol table

X-SVN-Rev: 14929
This commit is contained in:
Mark Davis 2004-04-12 01:45:24 +00:00
parent 2bf3e1a0f1
commit 6a4883a9a3
2 changed files with 118 additions and 8 deletions

View file

@ -9,6 +9,7 @@ import java.lang.reflect.Field;
import java.text.ParseException;
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
@ -61,7 +62,7 @@ public class MakeUnicodeFiles {
/**
 * Command-line entry point. The generateFile() call is commented out, so
 * this method only runs the invariant tests.
 * @param args command-line arguments; not read by this method
 * @throws IOException declared because testInvariants is declared to throw it
 */
public static void main(String[] args) throws IOException {
//generateFile();
testInvariants(ToolUnicodePropertySource.make(Default.ucdVersion()));
testInvariants();
}
static class Format {
@ -1105,15 +1106,60 @@ public class MakeUnicodeFiles {
}
}
// Reusable matcher for a "left relation right" shape: three capture groups
// separated by optional whitespace, where group 2 is one of = > < ! ?
// NOTE(review): groups 1 and 3 have no quantifier, so each matches exactly one
// character rather than a whole set expression -- confirm whether this is still used.
static Matcher invariantLine = Pattern.compile("([^=><!?])\\s*([=><!?])\\s*([^=><!?])").matcher("");
/**
 * Chains together several SymbolTables so that they behave as one.
 * Every lookup method consults the tables in the order they were passed to
 * the constructor and returns the first non-null answer; null is returned
 * only when no table in the chain produces a result.
 * @author Davis
 */
static class ChainedSymbolTable implements SymbolTable {
    // TODO: add accessors?
    // Private snapshot of the chained tables, in lookup order.
    private final List symbolTables;

    /**
     * Each SymbolTable is accessed in order by the other methods,
     * so the first in the list is accessed first, etc.
     * @param symbolTables the tables to chain, highest priority first. The
     * array is copied, so later changes to it do not affect this object.
     */
    ChainedSymbolTable(SymbolTable[] symbolTables) {
        // Arrays.asList only wraps the caller's array; copy it so the chain
        // cannot be altered from outside after construction.
        this.symbolTables = new ArrayList(Arrays.asList(symbolTables));
    }

    /**
     * Looks up a variable name in each table in turn.
     * @param s the name to look up
     * @return the first non-null result from a chained table, or null if
     * every table returns null
     */
    public char[] lookup(String s) {
        for (Iterator it = symbolTables.iterator(); it.hasNext();) {
            SymbolTable st = (SymbolTable) it.next();
            char[] result = st.lookup(s);
            if (result != null) {
                return result;
            }
        }
        return null;
    }

    /**
     * Looks up the matcher for a stand-in character in each table in turn.
     * @param ch the character to look up
     * @return the first non-null matcher from a chained table, or null if
     * every table returns null
     */
    public UnicodeMatcher lookupMatcher(int ch) {
        for (Iterator it = symbolTables.iterator(); it.hasNext();) {
            SymbolTable st = (SymbolTable) it.next();
            UnicodeMatcher result = st.lookupMatcher(ch);
            if (result != null) {
                return result;
            }
        }
        return null;
    }

    /**
     * Asks each table in turn to parse a reference at the given position.
     * A table that returns null may still have moved pos, so the starting
     * index is restored before the next table is tried and before null is
     * returned; this method therefore no longer depends on every delegate
     * leaving pos alone on failure.
     * @param text the text being parsed
     * @param pos on entry, where parsing starts; on success, left wherever
     * the successful table put it; on failure, reset to its entry index
     * @param limit passed through unchanged to each table
     * @return the first non-null reference name from a chained table, or
     * null if every table returns null
     */
    public String parseReference(String text, ParsePosition pos, int limit) {
        final int start = pos.getIndex();
        for (Iterator it = symbolTables.iterator(); it.hasNext();) {
            SymbolTable st = (SymbolTable) it.next();
            String result = st.parseReference(text, pos, limit);
            if (result != null) {
                return result;
            }
            // Undo any partial advance made by the table that failed.
            pos.setIndex(start);
        }
        return null;
    }
}
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[=><!?]");
static final UnicodeSet INVARIANT_RELATIONS = new UnicodeSet("[\\= \\! \\? \\< \\> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]");
static void testInvariants(UnicodeProperty.Factory factory) throws IOException {
static void testInvariants() throws IOException {
PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt");
out.write('\uFEFF'); // BOM
BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt");
BagFormatter bf = new BagFormatter();
SymbolTable st = factory.getSymbolTable();
ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] {
ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"),
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")});
ParsePosition pp = new ParsePosition(0);
int parseErrorCount = 0;
int testFailureCount = 0;
@ -1122,6 +1168,7 @@ public class MakeUnicodeFiles {
String leftSide = null;
String line = in.readLine();
if (line == null) break;
if (line.startsWith("\uFEFF")) line = line.substring(1);
line = line.trim();
int pos = line.indexOf('#');
if (pos >= 0) line = line.substring(0,pos).trim();
@ -1137,7 +1184,8 @@ public class MakeUnicodeFiles {
eatWhitespace(line, pp);
relation = line.charAt(pp.getIndex());
if (!INVARIANT_RELATIONS.contains(relation)) {
throw new ParseException("Invalid relation", pp.getIndex());
throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false),
pp.getIndex());
}
pp.setIndex(pp.getIndex()+1); // skip char
eatWhitespace(line, pp);
@ -1172,8 +1220,10 @@ public class MakeUnicodeFiles {
boolean ok = true;
switch(relation) {
case '=': ok = leftSet.equals(rightSet); break;
case '>': ok = leftSet.containsAll(rightSet); break;
case '<': ok = rightSet.containsAll(leftSet); break;
case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break;
case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break;
case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break;
case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break;
case '!': ok = leftSet.containsNone(rightSet); break;
case '?': ok = !leftSet.equals(rightSet)
&& !leftSet.containsAll(rightSet)

View file

@ -0,0 +1,60 @@
# Invariance tests
# Each line indicates an invariant set relationship to be tested,
# and is of the form:
#
# line := set relation set
#
# relation := '=' // has identical contents to
# := ('>' | '⊃') // is proper superset of
# := ('≥' | '⊇') // is superset of
# := ('<' | '⊂') // is proper subset of
# := ('≤' | '⊆') // is subset of
# := '!' // has no intersection
# := '?' // none of the above (they overlap, and neither contains the other)
#
# A set is a standard UnicodeSet, but where $pv can be used to express properties
#
# pv := '$' '×'? prop (('=' | ':') value)?
#
# The × indicates that the property is taken from the previously released version.
# That is, if the version is 4.0.1, then the × version is 4.0.0.
# If the value is missing, it defaults to true.
# If the value is of the form «...», then the ... is interpreted as a regular expression
# The property can be the short or long form as in PropertyAliases.txt
# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt
#
# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in
# Perl or other regular-expression languages. Examples:
# [$General_Category:Unassigned-[a-zA-Z]]
# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html
#
# WARNING: do not use \p{...} or [:...:] syntax, since those will be
# ICU's current version of properties, not the current snapshot's.
# Use the $ notation for properties (listed above) instead.
#
# When this file is parsed, an error message may contain <@>
# to indicate the location of an error in the input line.
# The following are not very interesting, but show examples of use
#$GC:Zs ! $GC:Zp
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
$GC:Zs ? $Name:«.*SPACE.*»
# Examples of parsing errors
# $LBA:Neutral = $GC:Zp # example of nonexistent property
# $LB:foo = $GC:Zp # example of nonexistent value
# $GC:Zs @ $GC:Zp # example of unknown relation
# The following should be real invariants
# For illustration, different alias styles are used
$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
$LB:OP = $GC:Ps
$General_Category:Decimal_Number = $Numeric_Type:Decimal
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
$ID_Start ⊇ $×ID_Start
$ID_Continue ⊇ $×ID_Continue