From 6a4883a9a3c3cd45bf2b12814ed8f690f439ea0e Mon Sep 17 00:00:00 2001 From: Mark Davis Date: Mon, 12 Apr 2004 01:45:24 +0000 Subject: [PATCH] updated symbol table X-SVN-Rev: 14929 --- .../com/ibm/text/UCD/MakeUnicodeFiles.java | 66 ++++++++++++++++--- .../com/ibm/text/UCD/UnicodeInvariants.txt | 60 +++++++++++++++++ 2 files changed, 118 insertions(+), 8 deletions(-) create mode 100644 tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt diff --git a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java index f380a83f591..f175b5e531e 100644 --- a/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java +++ b/tools/unicodetools/com/ibm/text/UCD/MakeUnicodeFiles.java @@ -9,6 +9,7 @@ import java.lang.reflect.Field; import java.text.ParseException; import java.text.ParsePosition; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Comparator; import java.util.HashMap; @@ -61,7 +62,7 @@ public class MakeUnicodeFiles { public static void main(String[] args) throws IOException { //generateFile(); - testInvariants(ToolUnicodePropertySource.make(Default.ucdVersion())); + testInvariants(); } static class Format { @@ -1105,15 +1106,60 @@ public class MakeUnicodeFiles { } } - static Matcher invariantLine = Pattern.compile("([^=> \u2264 \u2265 \u2282 \u2286 \u2283 \u2287]"); - static void testInvariants(UnicodeProperty.Factory factory) throws IOException { + static void testInvariants() throws IOException { PrintWriter out = BagFormatter.openUTF8Writer(UCD_Types.GEN_DIR, "UnicodeInvariantResults.txt"); + out.write('\uFEFF'); // BOM BufferedReader in = BagFormatter.openUTF8Reader("", "UnicodeInvariants.txt"); BagFormatter bf = new BagFormatter(); - SymbolTable st = factory.getSymbolTable(); + ChainedSymbolTable st = new ChainedSymbolTable(new SymbolTable[] { + ToolUnicodePropertySource.make("4.0.0").getSymbolTable("\u00D7"), + 
ToolUnicodePropertySource.make(Default.ucdVersion()).getSymbolTable("")}); ParsePosition pp = new ParsePosition(0); int parseErrorCount = 0; int testFailureCount = 0; @@ -1122,6 +1168,7 @@ public class MakeUnicodeFiles { String leftSide = null; String line = in.readLine(); if (line == null) break; + if (line.startsWith("\uFEFF")) line = line.substring(1); line = line.trim(); int pos = line.indexOf('#'); if (pos >= 0) line = line.substring(0,pos).trim(); @@ -1137,7 +1184,8 @@ public class MakeUnicodeFiles { eatWhitespace(line, pp); relation = line.charAt(pp.getIndex()); if (!INVARIANT_RELATIONS.contains(relation)) { - throw new ParseException("Invalid relation", pp.getIndex()); + throw new ParseException("Invalid relation, must be one of " + INVARIANT_RELATIONS.toPattern(false), + pp.getIndex()); } pp.setIndex(pp.getIndex()+1); // skip char eatWhitespace(line, pp); @@ -1172,8 +1220,10 @@ public class MakeUnicodeFiles { boolean ok = true; switch(relation) { case '=': ok = leftSet.equals(rightSet); break; - case '>': ok = leftSet.containsAll(rightSet); break; - case '<': ok = rightSet.containsAll(leftSet); break; + case '<': case '\u2282': ok = rightSet.containsAll(leftSet) && !leftSet.equals(rightSet); break; + case '>': case '\u2283': ok = leftSet.containsAll(rightSet) && !leftSet.equals(rightSet); break; + case '\u2264': case '\u2286': ok = rightSet.containsAll(leftSet); break; + case '\u2265': case '\u2287': ok = leftSet.containsAll(rightSet); break; case '!': ok = leftSet.containsNone(rightSet); break; case '?': ok = !leftSet.equals(rightSet) && !leftSet.containsAll(rightSet) diff --git a/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt new file mode 100644 index 00000000000..bc101f29ef0 --- /dev/null +++ b/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt @@ -0,0 +1,60 @@ +# Invariance tests +# Each line indicates an invariant set relationship to be tested, +# and is of the form: +# +# 
line := set relation set +# +# relation := '=' // has identical contents to +# := ('>' | '⊃') // is proper superset of +# := ('≥' | '⊇') // is superset of +# := ('<' | '⊂') // is proper subset of +# := ('≤' | '⊆') // is subset of +# := '!' // has no intersection +# := '?' // none of the above (they overlap, and neither contains the other) +# +# A set is a standard UnicodeSet, but where $pv can be used to express properties +# +# pv := '$' '×'? prop (('=' | ':') value)? +# +# The × indicates that the property is the previously released version. +# That is, if the version is 4.0.1, then the × version is 4.0.0 +# If the value is missing, it is defaulted to true +# If the value is of the form «...», then the ... is interpreted as a regular expression +# The property can be the short or long form as in the PropertyAliases.txt +# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt +# +# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in +# Perl or other regular-expression languages. Examples: +# [$General_Category:Unassigned-[a-zA-Z]] +# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html +# +# WARNING: do not use \p{...} or [:...:] syntax, since those will be +# ICU's current version of properties, not the current snapshot's. +# Use the $ notation for properties (listed above) instead. +# +# When this file is parsed, an error message may contain <@> +# to indicate the location of an error in the input line. + +# The following are not very interesting, but show examples of use + +#$GC:Zs ! $GC:Zp +#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter +$GC:Zs ? 
$Name:«.*SPACE.*» + +# Examples of parsing errors + +# $LBA:Neutral = $GC:Zp # example of nonexistent property +# $LB:foo = $GC:Zp # example of nonexistent value +# $GC:Zs @ $GC:Zp # example of unknown relation + +# The following should be real invariants +# For illustration, different alias styles are used + +$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse] +$LB:OP = $GC:Ps +$General_Category:Decimal_Number = $Numeric_Type:Decimal +$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl] +$ID_Start ⊇ $×ID_Start +$ID_Continue ⊇ $×ID_Continue + +