From a4a66fdc7f876d32a01415c74673284f693ce374 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Wed, 17 Oct 2001 19:19:00 +0000 Subject: [PATCH] ICU-86 initial implementation of perl-ish character property syntax for UnicodeSet X-SVN-Rev: 6280 --- .../dev/test/translit/TransliteratorTest.java | 19 +- .../icu/dev/test/translit/UnicodeSetTest.java | 18 +- icu4j/src/com/ibm/icu/text/Quantifier.java | 10 +- .../ibm/icu/text/TransliteratorParser.java | 54 +- .../com/ibm/icu/text/UnicodePropertySet.java | 590 ++++++++++++++++++ icu4j/src/com/ibm/icu/text/UnicodeSet.java | 431 ++++--------- .../ibm/test/translit/TransliteratorTest.java | 19 +- .../com/ibm/test/translit/UnicodeSetTest.java | 18 +- icu4j/src/com/ibm/text/Quantifier.java | 10 +- .../com/ibm/text/TransliteratorParser.java | 54 +- .../src/com/ibm/text/UnicodePropertySet.java | 590 ++++++++++++++++++ icu4j/src/com/ibm/text/UnicodeSet.java | 431 ++++--------- 12 files changed, 1590 insertions(+), 654 deletions(-) create mode 100755 icu4j/src/com/ibm/icu/text/UnicodePropertySet.java create mode 100755 icu4j/src/com/ibm/text/UnicodePropertySet.java diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java index ce3ec588a72..5e42d21a6ca 100755 --- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java +++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $ - * $Date: 2001/10/10 20:23:27 $ - * $Revision: 1.52 $ + * $Date: 2001/10/17 19:19:00 $ + * $Revision: 1.53 $ * ***************************************************************************************** */ @@ -1318,7 +1318,7 @@ public class TransliteratorTest extends TestFmwk { public TestFact(String theID) { id = theID; } - public Transliterator getInstance() { + public 
Transliterator getInstance(String ignoredID) { return new NameableNullTrans(id); } }; @@ -1533,6 +1533,15 @@ public class TransliteratorTest extends TestFmwk { } } + /** + * Test new property set syntax + */ + public void TestPropertySet() { + expect("a>A; \\p{Lu}>x; \\p{ALL}>y;", "abcDEF", "Ayyxxx"); + expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9", + "[ a stitch ]\n[ in time ]\r[ saves 9]"); + } + //====================================================================== // icu4j ONLY // These tests are not mirrored (yet) in icu4c at @@ -1551,6 +1560,10 @@ public class TransliteratorTest extends TestFmwk { } } + public void TestDebugIndic() { + expect("'-'h\\u0323>a;", "-h\u0323", "a"); + } + //====================================================================== // Ram's tests //====================================================================== diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java index 0bf73ddf05f..e8cb9fbc859 100755 --- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java +++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $ - * $Date: 2001/10/10 21:35:33 $ - * $Revision: 1.13 $ + * $Date: 2001/10/17 19:17:59 $ + * $Revision: 1.14 $ * ***************************************************************************************** */ @@ -52,7 +52,7 @@ public class UnicodeSetTest extends TestFmwk { // not used int TOP = 0x200; // Don't need to go over the whole range: set = new UnicodeSet("[:L:]"); for (int i=0; i<0x200; ++i) { - boolean l = Character.isLetter((char)i); + boolean l = UCharacter.isLetter(i); if (l != set.contains((char)i)) { errln("FAIL: L contains " + (char)i + " = " + set.contains((char)i)); @@ -62,7 +62,7 @@ public class 
UnicodeSetTest extends TestFmwk { set = new UnicodeSet("[:Lu:]"); for (int i=0; i<0x200; ++i) { - boolean lu = (Character.getType((char)i) == Character.UPPERCASE_LETTER); + boolean lu = (UCharacter.getType(i) == UCharacterCategory.UPPERCASE_LETTER); if (lu != set.contains((char)i)) { errln("FAIL: Lu contains " + (char)i + " = " + set.contains((char)i)); @@ -249,11 +249,13 @@ public class UnicodeSetTest extends TestFmwk { /** * Test the [:Latin:] syntax. */ - public void TestScriptSet() { + public void TestPropertySet() { UnicodeSet set = new UnicodeSet("[:Latin:]"); expectContainment(set, "aA", "\u0391\u03B1"); - set = new UnicodeSet("[:Greek:]"); + set = new UnicodeSet("[\\p{Greek}]"); expectContainment(set, "\u0391\u03B1", "aA"); + set = new UnicodeSet("\\P{ GENERAL Category = upper case letter }"); + expectContainment(set, "abc", "ABC"); } /** @@ -453,7 +455,7 @@ public class UnicodeSetTest extends TestFmwk { } } if (bad.length() > 0) { - logln(Utility.escape("Fail: set " + set + " does not contain " + bad + + errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + ", expected containment of " + charsIn)); } else { logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); @@ -468,7 +470,7 @@ public class UnicodeSetTest extends TestFmwk { } } if (bad.length() > 0) { - logln(Utility.escape("Fail: set " + set + " contains " + bad + + errln(Utility.escape("FAIL: set " + set + " contains " + bad + ", expected non-containment of " + charsOut)); } else { logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); diff --git a/icu4j/src/com/ibm/icu/text/Quantifier.java b/icu4j/src/com/ibm/icu/text/Quantifier.java index f663a75f314..6966c6aec0d 100755 --- a/icu4j/src/com/ibm/icu/text/Quantifier.java +++ b/icu4j/src/com/ibm/icu/text/Quantifier.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Quantifier.java,v $ - * $Date: 
2001/10/04 18:24:15 $ - * $Revision: 1.1 $ + * $Date: 2001/10/17 19:17:06 $ + * $Revision: 1.2 $ * ***************************************************************************************** */ @@ -45,9 +45,15 @@ class Quantifier implements UnicodeMatcher { int start = offset[0]; int count = 0; while (count < maxCount) { + int pos = offset[0]; int m = matcher.matches(text, offset, limit, incremental); if (m == U_MATCH) { ++count; + if (pos == offset[0]) { + // If offset has not moved we have a zero-width match. + // Don't keep matching it infinitely. + break; + } } else if (incremental && m == U_PARTIAL_MATCH) { return U_PARTIAL_MATCH; } else { diff --git a/icu4j/src/com/ibm/icu/text/TransliteratorParser.java b/icu4j/src/com/ibm/icu/text/TransliteratorParser.java index 096cfaf44dc..316b347b490 100755 --- a/icu4j/src/com/ibm/icu/text/TransliteratorParser.java +++ b/icu4j/src/com/ibm/icu/text/TransliteratorParser.java @@ -1,3 +1,13 @@ +/* +********************************************************************** +* Copyright (c) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $ +* $Date: 2001/10/17 19:17:06 $ +* $Revision: 1.4 $ +********************************************************************** +*/ package com.ibm.text; import com.ibm.text.resources.ResourceReader; @@ -85,6 +95,13 @@ class TransliteratorParser { */ private String undefinedVariableName; + /** + * The stand-in character for the 'dot' set, represented by '.' in + * patterns. This is allocated the first time it is needed, and + * reused thereafter. 
+ */ + private int dotStandIn = -1; + //---------------------------------------------------------------------- // Constants //---------------------------------------------------------------------- @@ -109,8 +126,6 @@ class TransliteratorParser { private static final char CONTEXT_ANTE = '{'; // ante{key private static final char CONTEXT_POST = '}'; // key}post - private static final char SET_OPEN = '['; - private static final char SET_CLOSE = ']'; private static final char CURSOR_POS = '|'; private static final char CURSOR_OFFSET = '@'; private static final char ANCHOR_START = '^'; @@ -119,6 +134,9 @@ class TransliteratorParser { private static final char ONE_OR_MORE = '+'; private static final char ZERO_OR_ONE = '?'; + private static final char DOT = '.'; + private static final String DOT_SET = "[^[:Zp:][:Zl:]\r\n$]"; + // By definition, the ANCHOR_END special character is a // trailing SymbolTable.SYMBOL_REF character. // private static final char ANCHOR_END = '$'; @@ -541,6 +559,15 @@ class TransliteratorParser { // Text after a presumed end anchor is a syntax err syntaxError("Malformed variable reference", rule, start); } + if (UnicodeSet.resemblesPattern(rule, pos-1)) { + if (pp == null) { + pp = new ParsePosition(0); + } + pp.setIndex(pos-1); // Backup to opening '[' + buf.append(parser.parseSet(rule, pp)); + pos = pp.getIndex(); + continue; + } // Handle escapes if (c == ESCAPE) { if (pos == limit) { @@ -682,14 +709,6 @@ class TransliteratorParser { } post = buf.length(); break; - case SET_OPEN: - if (pp == null) { - pp = new ParsePosition(0); - } - pp.setIndex(pos-1); // Backup to opening '[' - buf.append(parser.parseSet(rule, pp)); - pos = pp.getIndex(); - break; case CURSOR_POS: if (cursor >= 0) { syntaxError("Multiple cursors", rule, start); @@ -718,6 +737,9 @@ class TransliteratorParser { } } break; + case DOT: + buf.append(parser.getDotStandIn()); + break; case KLEENE_STAR: case ONE_OR_MORE: case ZERO_OR_ONE: @@ -783,7 +805,6 @@ class 
TransliteratorParser { buf.append(parser.generateStandInFor(m)); } break; - // case SET_CLOSE: default: // Disallow unquoted characters other than [0-9A-Za-z] // in the printable ASCII range. These characters are @@ -1357,6 +1378,17 @@ class TransliteratorParser { return variableNext++; } + /** + * Return the stand-in for the dot set. It is allocated the first + * time and reused thereafter. + */ + char getDotStandIn() { + if (dotStandIn == -1) { + dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); + } + return (char) dotStandIn; + } + /** * Append the value of the given variable name to the given * StringBuffer. diff --git a/icu4j/src/com/ibm/icu/text/UnicodePropertySet.java b/icu4j/src/com/ibm/icu/text/UnicodePropertySet.java new file mode 100755 index 00000000000..7fbbf6aa61f --- /dev/null +++ b/icu4j/src/com/ibm/icu/text/UnicodePropertySet.java @@ -0,0 +1,590 @@ +/* +********************************************************************** +* Copyright (c) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UnicodePropertySet.java,v $ +* $Date: 2001/10/17 19:17:06 $ +* $Revision: 1.1 $ +********************************************************************** +*/ +package com.ibm.text; + +import java.text.*; +import java.util.*; +import com.ibm.util.Utility; + +/** + * INTERNAL CLASS implementing the UnicodeSet properties as outlined + * at: + * + * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html + * + * Recognized syntax: + * + * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]" + * \p{foo} \P{foo} - white space not allowed within "\p" or "\P" + * + * Other than the above restrictions, white space is ignored. Case + * is ignored except in "\p" and "\P". + * + * This class cannot be instantiated. 
It has a public static method, + * createFromPattern(), which takes a pattern to be parsed and returns + * a new UnicodeSet. Another public static method, + * resemblesPattern(), returns true if a given pattern string appears + * to be a property set pattern, and therefore should be passed in to + * createFromPattern(). + * + * NOTE: Current implementation is incomplete. The following list + * indicates which properties are supported. + * + * + GeneralCategory + * CombiningClass + * BidiClass + * DecompositionType + * + NumericValue + * NumericType + * EastAsianWidth + * LineBreak + * JoiningType + * + Script + * + * '+' indicates a supported property. + * + * @author Alan Liu + * @version $RCSfile: UnicodePropertySet.java,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:17:06 $ + */ +class UnicodePropertySet { + + private static final Hashtable NAME_MAP = new Hashtable(); + + private static final Hashtable CATEGORY_MAP = new Hashtable(); + + /** + * A cache mapping character category integers, as returned by + * UCharacter.getType(), to sets. Entries are initially + * null and are created on demand. + */ + private static final UnicodeSet[] CATEGORY_CACHE = + new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT]; + + /** + * A cache mapping script integers, as defined by + * UScript, to sets. Entries are initially + * null and are created on demand. + */ + private static final UnicodeSet[] SCRIPT_CACHE = + new UnicodeSet[UScript.CODE_LIMIT]; + + // Special value codes + private static final int ANY = -1; // general category: all code points + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + + /** + * Return true if the given position, in the given pattern, appears + * to be the start of a property set pattern [:foo:], \p{foo}, or + * \P{foo}.
+ */ + public static boolean resemblesPattern(String pattern, int pos) { + // Patterns are at least 5 characters long + if ((pos+5) > pattern.length()) { + return false; + } + + // Look for an opening [:, [:^, \p, or \P + return pattern.regionMatches(pos, "[:", 0, 2) || + pattern.regionMatches(true, pos, "\\p", 0, 2); + } + + /** + * Create a UnicodeSet by parsing the given pattern at the given + * parse position. + * + * @param pattern the pattern string + * @param ppos on entry, the position at which to begin parsing. + * This should be one of the locations marked '^': + * + * [:blah:] \p{blah} \P{blah} + * ^ % ^ % ^ % + * + * On return, the position after the last character parsed, that is, + * the locations marked '%'. If the parse fails, ppos is returned + * unchanged. + * @return a newly-constructed UnicodeSet object, or null upon + * failure. + */ + public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) { + + UnicodeSet set = null; + + int pos = ppos.getIndex(); + + // On entry, ppos should point to one of the following locations: + + // Minimum length is 5 characters, e.g. \p{L} + if ((pos+5) > pattern.length()) { + return null; + } + + boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} + boolean invert = false; + + // Look for an opening [:, [:^, \p, or \P + if (pattern.regionMatches(pos, "[:", 0, 2)) { + posix = true; + pos = skipWhitespace(pattern, pos+2); + if (pos < pattern.length() && pattern.charAt(pos) == '^') { + ++pos; + invert = true; + } + } else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) { + invert = (pattern.charAt(pos+1) == 'P'); + pos = skipWhitespace(pattern, pos+2); + if (pos == pattern.length() || pattern.charAt(pos++) != '{') { + // Syntax error; "\p" or "\P" not followed by "{" + return null; + } + } else { + // Open delimiter not seen + return null; + } + + // Look for the matching close delimiter, either :] or } + int close = pattern.indexOf(posix ? 
":]" : "}", pos); + if (close < 0) { + // Syntax error; close delimiter missing + return null; + } + + // Look for an '=' sign. If this is present, we will parse a + // medium \p{gc=Cf} or long \p{GeneralCategory=Format} + // pattern. + int equals = pattern.indexOf('=', pos); + if (equals >= 0 && equals < close) { + // Equals seen; parse medium/long pattern + String typeName = munge(pattern, pos, equals); + String valueName = munge(pattern, equals+1, close); + SetFactory factory; + factory = (SetFactory) NAME_MAP.get(typeName); + if (factory == null) { + // Syntax error; type name not recognized + return null; + } + set = factory.create(valueName); + } else { + // No equals seen; parse short format \p{Cf} + String shortName = munge(pattern, pos, close); + + // First try general category + set = createCategorySet(shortName); + + // If this fails, try script + if (set == null) { + set = createScriptSet(shortName); + } + } + + if (invert) { + set.complement(); + } + + // Move to the limit position after the close delimiter + ppos.setIndex(close + (posix ? 2 : 1)); + + return set; + } + + //---------------------------------------------------------------- + // Property set factory classes + // NOTE: This will change/go away when we implement UCharacter + // based property retrieval. 
+ //---------------------------------------------------------------- + + static interface SetFactory { + + UnicodeSet create(String valueName); + } + + static class NumericValueFactory implements SetFactory { + NumericValueFactory() {} + public UnicodeSet create(String valueName) { + double value = Double.parseDouble(valueName); + final int ivalue = (int) value; + if (ivalue != value || ivalue < 0) { + // UCharacter doesn't support negative or non-integral + // values, so just return an empty set + return new UnicodeSet(); + } + return createSetFromFilter(new Filter() { + public boolean contains(int cp) { + return UCharacter.getUnicodeNumericValue(cp) == ivalue; + } + }); + } + } + + //---------------------------------------------------------------- + // Property set factory static methods + // NOTE: This will change/go away when we implement UCharacter + // based property retrieval. + //---------------------------------------------------------------- + + /** + * Given a general category value name, create a corresponding + * set and return it, or return null if the name is invalid. + * @param valueName a pre-munged general category value name + */ + private static UnicodeSet createCategorySet(String valueName) { + Integer valueObj; + valueObj = (Integer) CATEGORY_MAP.get(valueName); + if (valueObj == null) { + return null; + } + int valueCode = valueObj.intValue(); + + UnicodeSet set = new UnicodeSet(); + if (valueCode == ANY) { + set.complement(); + return set; + } + for (int cat=0; cat= 0) { + set.add(start, end); + } + start = end = i; + } + } + } + if (start >= 0) { + set.add(start, end); + } + return set; + } + + //---------------------------------------------------------------- + // Type and value name maps + //---------------------------------------------------------------- + + /** + * Add a type mapping to the name map. 
+ */ + private static void addType(String shortName, String longName, + SetFactory factory) { + // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD + if (true) { + if (NAME_MAP.get(shortName) != null) { + throw new InternalError("Duplicate name " + shortName); + } + if (NAME_MAP.get(longName) != null) { + throw new InternalError("Duplicate name " + longName); + } + } + + NAME_MAP.put(shortName, factory); + NAME_MAP.put(longName, factory); + } + + /** + * Add a value mapping to the name map. + */ + private static void addValue(Hashtable map, + String shortName, String longName, + int value) { + // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD + if (true) { + if (map.get(shortName) != null) { + throw new InternalError("Duplicate name " + shortName); + } + if (longName != null && map.get(longName) != null) { + throw new InternalError("Duplicate name " + longName); + } + } + + Integer valueObj = new Integer(value); + map.put(shortName, valueObj); + if (longName != null) { + map.put(longName, valueObj); + } + } + + static { + // NOTE: We munge all search keys to have no whitespace + // and upper case. As such, all stored keys should have + // this format. 
+ + // Load the map with type data + + addType("GC", "GENERALCATEGORY", new SetFactory() { + public UnicodeSet create(String valueName) { + return createCategorySet(valueName); + } + }); + + //addType("CC", "COMBININGCLASS", COMBINING_CLASS); + //addType("BC", "BIDICLASS", BIDI_CLASS); + //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE); + + addType("NV", "NUMERICVALUE", new NumericValueFactory()); + + //addType("NT", "NUMERICTYPE", NUMERIC_TYPE); + //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH); + //addType("LB", "LINEBREAK", LINE_BREAK); + //addType("JT", "JOININGTYPE", JOINING_TYPE); + + addType("SC", "SCRIPT", new SetFactory() { + public UnicodeSet create(String valueName) { + return createScriptSet(valueName); + } + }); + + // Load the map with value data + + // General Category + + addValue(CATEGORY_MAP, "ANY", null, ANY); // special case + + addValue(CATEGORY_MAP, "C", "OTHER", + (1 << UCharacterCategory.CONTROL) | + (1 << UCharacterCategory.FORMAT) | + (1 << UCharacterCategory.GENERAL_OTHER_TYPES) | + (1 << UCharacterCategory.PRIVATE_USE) | + (1 << UCharacterCategory.SURROGATE)); + + addValue(CATEGORY_MAP, "CC", "CONTROL", + 1 << UCharacterCategory.CONTROL); + addValue(CATEGORY_MAP, "CF", "FORMAT", + 1 << UCharacterCategory.FORMAT); + addValue(CATEGORY_MAP, "CN", "UNASSIGNED", + 1 << UCharacterCategory.GENERAL_OTHER_TYPES); + addValue(CATEGORY_MAP, "CO", "PRIVATEUSE", + 1 << UCharacterCategory.PRIVATE_USE); + addValue(CATEGORY_MAP, "CS", "SURROGATE", + 1 << UCharacterCategory.SURROGATE); + + addValue(CATEGORY_MAP, "L", "LETTER", + (1 << UCharacterCategory.LOWERCASE_LETTER) | + (1 << UCharacterCategory.MODIFIER_LETTER) | + (1 << UCharacterCategory.OTHER_LETTER) | + (1 << UCharacterCategory.TITLECASE_LETTER) | + (1 << UCharacterCategory.UPPERCASE_LETTER)); + + addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER", + 1 << UCharacterCategory.LOWERCASE_LETTER); + addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER", + 1 << UCharacterCategory.MODIFIER_LETTER); + 
addValue(CATEGORY_MAP, "LO", "OTHERLETTER", + 1 << UCharacterCategory.OTHER_LETTER); + addValue(CATEGORY_MAP, "LT", "TITLECASELETTER", + 1 << UCharacterCategory.TITLECASE_LETTER); + addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER", + 1 << UCharacterCategory.UPPERCASE_LETTER); + + addValue(CATEGORY_MAP, "M", "MARK", + (1 << UCharacterCategory.NON_SPACING_MARK) | + (1 << UCharacterCategory.COMBINING_SPACING_MARK) | + (1 << UCharacterCategory.ENCLOSING_MARK)); + + addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK", + 1 << UCharacterCategory.NON_SPACING_MARK); + addValue(CATEGORY_MAP, "MC", "SPACINGMARK", + 1 << UCharacterCategory.COMBINING_SPACING_MARK); + addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK", + 1 << UCharacterCategory.ENCLOSING_MARK); + + addValue(CATEGORY_MAP, "N", "NUMBER", + (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | + (1 << UCharacterCategory.LETTER_NUMBER) | + (1 << UCharacterCategory.OTHER_NUMBER)); + + addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER", + 1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER); + addValue(CATEGORY_MAP, "NL", "LETTERNUMBER", + 1 << UCharacterCategory.LETTER_NUMBER); + addValue(CATEGORY_MAP, "NO", "OTHERNUMBER", + 1 << UCharacterCategory.OTHER_NUMBER); + + addValue(CATEGORY_MAP, "P", "PUNCTUATION", + (1 << UCharacterCategory.CONNECTOR_PUNCTUATION) | + (1 << UCharacterCategory.DASH_PUNCTUATION) | + (1 << UCharacterCategory.END_PUNCTUATION) | + (1 << UCharacterCategory.FINAL_PUNCTUATION) | + (1 << UCharacterCategory.INITIAL_PUNCTUATION) | + (1 << UCharacterCategory.OTHER_PUNCTUATION) | + (1 << UCharacterCategory.START_PUNCTUATION)); + + addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION", + 1 << UCharacterCategory.CONNECTOR_PUNCTUATION); + addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION", + 1 << UCharacterCategory.DASH_PUNCTUATION); + addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION", + 1 << UCharacterCategory.END_PUNCTUATION); + addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION", + 1 << UCharacterCategory.FINAL_PUNCTUATION); + 
addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION", + 1 << UCharacterCategory.INITIAL_PUNCTUATION); + addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION", + 1 << UCharacterCategory.OTHER_PUNCTUATION); + addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION", + 1 << UCharacterCategory.START_PUNCTUATION); + + addValue(CATEGORY_MAP, "S", "SYMBOL", + (1 << UCharacterCategory.CURRENCY_SYMBOL) | + (1 << UCharacterCategory.MODIFIER_SYMBOL) | + (1 << UCharacterCategory.MATH_SYMBOL) | + (1 << UCharacterCategory.OTHER_SYMBOL)); + + addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL", + 1 << UCharacterCategory.CURRENCY_SYMBOL); + addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL", + 1 << UCharacterCategory.MODIFIER_SYMBOL); + addValue(CATEGORY_MAP, "SM", "MATHSYMBOL", + 1 << UCharacterCategory.MATH_SYMBOL); + addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL", + 1 << UCharacterCategory.OTHER_SYMBOL); + + addValue(CATEGORY_MAP, "Z", "SEPARATOR", + (1 << UCharacterCategory.LINE_SEPARATOR) | + (1 << UCharacterCategory.PARAGRAPH_SEPARATOR) | + (1 << UCharacterCategory.SPACE_SEPARATOR)); + + addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR", + 1 << UCharacterCategory.LINE_SEPARATOR); + addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR", + 1 << UCharacterCategory.PARAGRAPH_SEPARATOR); + addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR", + 1 << UCharacterCategory.SPACE_SEPARATOR); + } +} diff --git a/icu4j/src/com/ibm/icu/text/UnicodeSet.java b/icu4j/src/com/ibm/icu/text/UnicodeSet.java index 7e2c1f41449..6be4481767b 100755 --- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $ - * $Date: 2001/10/10 21:35:05 $ - * $Revision: 1.39 $ + * $Date: 2001/10/17 19:17:06 $ + * $Revision: 1.40 $ * ***************************************************************************************** */ @@ -202,60 +202,26 @@ import 
com.ibm.util.Utility; * starting wih 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. * * - *

Character categories. + *

Character properties. * - * Character categories are specified using the POSIX-like syntax - * '[:Lu:]'. The complement of a category is specified by inserting - * '^' after the opening '[:'. The following category names are - * recognized. Actual determination of category data uses - * Character.getType(), so it reflects the underlying - * implmementation used by Character. As of Java 2 and - * JDK 1.1.8, this is Unicode 2.1.2. + *

Character properties are specified using the POSIX-like syntax + * "[:Lu:]" or the Perl-like syntax "\p{Lu}". The complement of a + * category is specified as "[:^Lu:]" or "\P{Lu}". Actual + * determination of category data is accomplished by UCharacter using + * the underlying Unicode database. * - *

- * Normative
- *     Mn = Mark, Non-Spacing
- *     Mc = Mark, Spacing Combining
- *     Me = Mark, Enclosing
+ * 

For details of the property syntax please see this + * <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html"> + * draft document</a>. * - * Nd = Number, Decimal Digit - * Nl = Number, Letter - * No = Number, Other - * - * Zs = Separator, Space - * Zl = Separator, Line - * Zp = Separator, Paragraph - * - * Cc = Other, Control - * Cf = Other, Format - * Cs = Other, Surrogate - * Co = Other, Private Use - * Cn = Other, Not Assigned - * - * Informative - * Lu = Letter, Uppercase - * Ll = Letter, Lowercase - * Lt = Letter, Titlecase - * Lm = Letter, Modifier - * Lo = Letter, Other - * - * Pc = Punctuation, Connector - * Pd = Punctuation, Dash - * Ps = Punctuation, Open - * Pe = Punctuation, Close - * *Pi = Punctuation, Initial quote - * *Pf = Punctuation, Final quote - * Po = Punctuation, Other - * - * Sm = Symbol, Math - * Sc = Symbol, Currency - * Sk = Symbol, Modifier - * So = Symbol, Other - *

- * *Unsupported by Java (and hence unsupported by UnicodeSet). + *

Note: Not all properties are currently supported. + * Currently, only the general category, script, and numeric value + * properties are supported. Support for other properties will be + * added in the future. * * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.39 $ $Date: 2001/10/10 21:35:05 $ */ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.40 $ $Date: 2001/10/17 19:17:06 $ + */ public class UnicodeSet extends UnicodeFilter { /* Implementation Notes. @@ -313,29 +279,12 @@ public class UnicodeSet extends UnicodeFilter { * modified using the non-pattern API, this string will be null, * indicating that toPattern() must generate a pattern * representation from the inversion list. - */ + */ private String pat = null; private static final int START_EXTRA = 16; // initial storage. Must be >= 0 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 - private static final String CATEGORY_NAMES = - // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 - //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8 - "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo"; - - private static final int UNSUPPORTED_CATEGORY = 17; - - private static final int CATEGORY_COUNT = 29; - - /** - * A cache mapping character category integers, as returned by - * Character.getType(), to inversion lists. Entries are initially - * null and are created on demand. 
- */ - private static final UnicodeSet[] CATEGORY_CACHE = - new UnicodeSet[CATEGORY_COUNT]; - //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- @@ -408,19 +357,27 @@ public class UnicodeSet extends UnicodeFilter { applyPattern(pattern, pos, symbols, true); } + private static final String CATEGORY_NAMES = + // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 + //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8 + "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo"; /** - * Constructs a set from the given Unicode character category. + * DEPRECATED - Constructs a set from the given Unicode character + * category. * @param category an integer indicating the character category as - * returned by Character.getType(). + * returned by java.lang.Character.getType(). Note + * that this is different from the UCharacterCategory + * codes. * @exception java.lang.IllegalArgumentException if the given * category is invalid. + * @deprecated this will be removed Dec-31-2001 */ public UnicodeSet(int category) { - if (category < 0 || category >= CATEGORY_COUNT || - category == UNSUPPORTED_CATEGORY) { + if (category < 0 || category > java.lang.Character.OTHER_SYMBOL || + category == 17) { throw new IllegalArgumentException("Invalid category"); } - set(getCategorySet(category)); + applyPattern(CATEGORY_NAMES.substring(2*category, 2*category+2), false); } /** @@ -489,6 +446,16 @@ public class UnicodeSet extends UnicodeFilter { } } + /** + * Return true if the given position, in the given pattern, appears + * to be the start of a UnicodeSet pattern. + */ + public static boolean resemblesPattern(String pattern, int pos) { + return ((pos+1) < pattern.length() && + pattern.charAt(pos) == '[') || + UnicodePropertySet.resemblesPattern(pattern, pos); + } + /** * Append the toPattern() representation of a * character to the given StringBuffer. 
@@ -509,6 +476,8 @@ public class UnicodeSet extends UnicodeFilter { case '^': // COMPLEMENT: case '&': // INTERSECTION: case '\\': //BACKSLASH: + case '{': + case '}': buf.append('\\'); break; default: @@ -607,28 +576,28 @@ public class UnicodeSet extends UnicodeFilter { } return result; } - + return _generatePattern(result, escapeUnprintable); } /** * Generate and append a string representation of this set to result. * This does not use this.pat, the cleaned up copy of the string - * passed to applyPattern(). + * passed to applyPattern(). */ public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { result.append('['); - // Check against the predefined categories. We implicitly build - // up ALL category sets the first time toPattern() is called. - for (int cat=0; cat 2 || anchor == 1) { throw new IllegalArgumentException("Syntax error near $" + pattern); - + } if (anchor == 2) { rebuildPattern = true; @@ -1524,16 +1484,24 @@ public class UnicodeSet extends UnicodeFilter { } } - if (lastChar != NONE) { + if (mode < 4) { + throw new IllegalArgumentException("Missing ']'"); + } + + // Treat a trailing '$' as indicating ETHER. This code is only + // executed if symbols == NULL; otherwise other code parses the + // anchor. 
+ if (lastChar == SymbolTable.SYMBOL_REF) { + rebuildPattern = true; + newPat.append(lastChar); + add(TransliterationRule.ETHER); + } + + else if (lastChar != NONE) { add(lastChar, lastChar); _appendToPat(newPat, lastChar, false); } -// if (mode == 0) { -// throw new IllegalArgumentException("Missing '[' in \"" + -// pattern.substring(start) + '"'); -// } - // Handle unprocessed stuff preceding the closing ']' if (lastOp == '-') { // Trailing '-' is treated as literal @@ -1543,7 +1511,9 @@ public class UnicodeSet extends UnicodeFilter { throw new IllegalArgumentException("Unquoted trailing " + lastOp); } - newPat.append(']'); + if (mode == 4) { + newPat.append(']'); + } /** * If we saw a '^' after the initial '[' of this pattern, then perform @@ -1553,21 +1523,6 @@ public class UnicodeSet extends UnicodeFilter { complement(); } - if (mode != 4) { - throw new IllegalArgumentException("Missing ']'"); - } - -// /** -// * i indexes the last character we parsed or is pattern.length(). In -// * the latter case, we have run off the end without finding a closing -// * ']'. Otherwise, we know i < pattern.length(), and we set the -// * ParsePosition to the next character to be parsed. -// */ -// if (i == limit) { -// throw new IllegalArgumentException("Missing ']' in \"" + -// pattern.substring(start) + '"'); -// } - pos.setIndex(i); // Use the rebuilt pattern (newPat) only if necessary. Prefer the @@ -1586,136 +1541,6 @@ public class UnicodeSet extends UnicodeFilter { } } - //---------------------------------------------------------------- - // Implementation: Generation of Unicode categories - //---------------------------------------------------------------- - - /** - * Sets this object to the given category, given its name. - * The category name must be either a two-letter name, such as - * "Lu", or a one letter name, such as "L". One-letter names - * indicate the logical union of all two-letter names that start - * with that letter. Case is significant. 
If the name starts - * with the character '^' then the complement of the given - * character set is returned. - * - * Although individual categories such as "Lu" are cached, we do - * not currently cache single-letter categories such as "L" or - * complements such as "^Lu" or "^L". It would be easy to cache - * these as well in a hashtable should the need arise. - * - * NEW: The category name can now be a script name, as defined - * by UScript. - */ - private void applyCategory(String catName) { - boolean invert = (catName.length() > 1 && - catName.charAt(0) == '^'); - if (invert) { - catName = catName.substring(1); - } - - boolean match = false; - - // BE CAREFUL not to modify the return value from - // getCategorySet(int). - - // if we have two characters, search the category map for that - // code and either construct and return a UnicodeSet from the - // data in the category map or throw an exception - if (catName.length() == 2) { - int i = CATEGORY_NAMES.indexOf(catName); - if (i>=0 && i%2==0) { - i /= 2; - if (i != UNSUPPORTED_CATEGORY) { - set(getCategorySet(i)); - match = true; - } - } - } else if (catName.length() == 1) { - // if we have one character, search the category map for - // codes beginning with that letter, and union together - // all of the matching sets that we find (or throw an - // exception if there are no matches) - clear(); - for (int i=0; i= 0) { - add(start, end); - } - start = end = i; - } - } - } - if (start >= 0) { - add(start, end); - } - } - } - - if (!match) { - throw new IllegalArgumentException("Illegal category [:" + catName + ":]"); - } - - if (invert) { - complement(); - } - } - - /** - * Returns an inversion list for the given category. This list is - * cached and returned again if this method is called again with - * the same parameter. - * - * Callers MUST NOT MODIFY the returned set. 
- */ - private static UnicodeSet getCategorySet(int cat) { - if (CATEGORY_CACHE[cat] == null) { - // Walk through all Unicode characters, noting the start - // and end of each range for which Character.getType(c) - // returns the given category integer. - UnicodeSet set = new UnicodeSet(); - int start = -1; - int end = -2; - for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) { - if (Character.getType((char)i) == cat) { - if ((end+1) == i) { - end = i; - } else { - if (start >= 0) { - set.add(start, end); - } - start = end = i; - } - } - } - if (start >= 0) { - set.add(start, end); - } - CATEGORY_CACHE[cat] = set; - } - return CATEGORY_CACHE[cat]; - } - //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- diff --git a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java index e1248c883d3..4ba63831b1c 100755 --- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java +++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $ - * $Date: 2001/10/10 20:23:27 $ - * $Revision: 1.52 $ + * $Date: 2001/10/17 19:19:00 $ + * $Revision: 1.53 $ * ***************************************************************************************** */ @@ -1318,7 +1318,7 @@ public class TransliteratorTest extends TestFmwk { public TestFact(String theID) { id = theID; } - public Transliterator getInstance() { + public Transliterator getInstance(String ignoredID) { return new NameableNullTrans(id); } }; @@ -1533,6 +1533,15 @@ public class TransliteratorTest extends TestFmwk { } } + /** + * Test new property set syntax + */ + public void TestPropertySet() { + expect("a>A; \\p{Lu}>x; \\p{ALL}>y;", "abcDEF", "Ayyxxx"); + expect("(.+)>'[' $1 
']';", " a stitch \n in time \r saves 9", + "[ a stitch ]\n[ in time ]\r[ saves 9]"); + } + //====================================================================== // icu4j ONLY // These tests are not mirrored (yet) in icu4c at @@ -1551,6 +1560,10 @@ public class TransliteratorTest extends TestFmwk { } } + public void TestDebugIndic() { + expect("'-'h\\u0323>a;", "-h\u0323", "a"); + } + //====================================================================== // Ram's tests //====================================================================== diff --git a/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java index 45e5402d242..6ce0445a9ed 100755 --- a/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java +++ b/icu4j/src/com/ibm/test/translit/UnicodeSetTest.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeSetTest.java,v $ - * $Date: 2001/10/10 21:35:33 $ - * $Revision: 1.13 $ + * $Date: 2001/10/17 19:17:59 $ + * $Revision: 1.14 $ * ***************************************************************************************** */ @@ -52,7 +52,7 @@ public class UnicodeSetTest extends TestFmwk { // not used int TOP = 0x200; // Don't need to go over the whole range: set = new UnicodeSet("[:L:]"); for (int i=0; i<0x200; ++i) { - boolean l = Character.isLetter((char)i); + boolean l = UCharacter.isLetter(i); if (l != set.contains((char)i)) { errln("FAIL: L contains " + (char)i + " = " + set.contains((char)i)); @@ -62,7 +62,7 @@ public class UnicodeSetTest extends TestFmwk { set = new UnicodeSet("[:Lu:]"); for (int i=0; i<0x200; ++i) { - boolean lu = (Character.getType((char)i) == Character.UPPERCASE_LETTER); + boolean lu = (UCharacter.getType(i) == UCharacterCategory.UPPERCASE_LETTER); if (lu != set.contains((char)i)) { errln("FAIL: Lu contains " + (char)i + " = " + set.contains((char)i)); @@ -249,11 
+249,13 @@ public class UnicodeSetTest extends TestFmwk { /** * Test the [:Latin:] syntax. */ - public void TestScriptSet() { + public void TestPropertySet() { UnicodeSet set = new UnicodeSet("[:Latin:]"); expectContainment(set, "aA", "\u0391\u03B1"); - set = new UnicodeSet("[:Greek:]"); + set = new UnicodeSet("[\\p{Greek}]"); expectContainment(set, "\u0391\u03B1", "aA"); + set = new UnicodeSet("\\P{ GENERAL Category = upper case letter }"); + expectContainment(set, "abc", "ABC"); } /** @@ -453,7 +455,7 @@ public class UnicodeSetTest extends TestFmwk { } } if (bad.length() > 0) { - logln(Utility.escape("Fail: set " + set + " does not contain " + bad + + errln(Utility.escape("FAIL: set " + set + " does not contain " + bad + ", expected containment of " + charsIn)); } else { logln(Utility.escape("Ok: set " + set + " contains " + charsIn)); @@ -468,7 +470,7 @@ public class UnicodeSetTest extends TestFmwk { } } if (bad.length() > 0) { - logln(Utility.escape("Fail: set " + set + " contains " + bad + + errln(Utility.escape("FAIL: set " + set + " contains " + bad + ", expected non-containment of " + charsOut)); } else { logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut)); diff --git a/icu4j/src/com/ibm/text/Quantifier.java b/icu4j/src/com/ibm/text/Quantifier.java index 7af81947757..365c1ce645b 100755 --- a/icu4j/src/com/ibm/text/Quantifier.java +++ b/icu4j/src/com/ibm/text/Quantifier.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Quantifier.java,v $ - * $Date: 2001/10/04 18:24:15 $ - * $Revision: 1.1 $ + * $Date: 2001/10/17 19:17:06 $ + * $Revision: 1.2 $ * ***************************************************************************************** */ @@ -45,9 +45,15 @@ class Quantifier implements UnicodeMatcher { int start = offset[0]; int count = 0; while (count < maxCount) { + int pos = offset[0]; int m = matcher.matches(text, offset, limit, 
incremental); if (m == U_MATCH) { ++count; + if (pos == offset[0]) { + // If offset has not moved we have a zero-width match. + // Don't keep matching it infinitely. + break; + } } else if (incremental && m == U_PARTIAL_MATCH) { return U_PARTIAL_MATCH; } else { diff --git a/icu4j/src/com/ibm/text/TransliteratorParser.java b/icu4j/src/com/ibm/text/TransliteratorParser.java index 096cfaf44dc..3ebcca71a2e 100755 --- a/icu4j/src/com/ibm/text/TransliteratorParser.java +++ b/icu4j/src/com/ibm/text/TransliteratorParser.java @@ -1,3 +1,13 @@ +/* +********************************************************************** +* Copyright (c) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $ +* $Date: 2001/10/17 19:17:06 $ +* $Revision: 1.4 $ +********************************************************************** +*/ package com.ibm.text; import com.ibm.text.resources.ResourceReader; @@ -85,6 +95,13 @@ class TransliteratorParser { */ private String undefinedVariableName; + /** + * The stand-in character for the 'dot' set, represented by '.' in + * patterns. This is allocated the first time it is needed, and + * reused thereafter. 
+ */ + private int dotStandIn = -1; + //---------------------------------------------------------------------- // Constants //---------------------------------------------------------------------- @@ -109,8 +126,6 @@ class TransliteratorParser { private static final char CONTEXT_ANTE = '{'; // ante{key private static final char CONTEXT_POST = '}'; // key}post - private static final char SET_OPEN = '['; - private static final char SET_CLOSE = ']'; private static final char CURSOR_POS = '|'; private static final char CURSOR_OFFSET = '@'; private static final char ANCHOR_START = '^'; @@ -119,6 +134,9 @@ class TransliteratorParser { private static final char ONE_OR_MORE = '+'; private static final char ZERO_OR_ONE = '?'; + private static final char DOT = '.'; + private static final String DOT_SET = "[^[:Zp:][:Zl:]\r\n$]"; + // By definition, the ANCHOR_END special character is a // trailing SymbolTable.SYMBOL_REF character. // private static final char ANCHOR_END = '$'; @@ -541,6 +559,15 @@ class TransliteratorParser { // Text after a presumed end anchor is a syntax err syntaxError("Malformed variable reference", rule, start); } + if (UnicodeSet.resemblesPattern(rule, pos-1)) { + if (pp == null) { + pp = new ParsePosition(0); + } + pp.setIndex(pos-1); // Backup to opening '[' + buf.append(parser.parseSet(rule, pp)); + pos = pp.getIndex(); + continue; + } // Handle escapes if (c == ESCAPE) { if (pos == limit) { @@ -682,14 +709,6 @@ class TransliteratorParser { } post = buf.length(); break; - case SET_OPEN: - if (pp == null) { - pp = new ParsePosition(0); - } - pp.setIndex(pos-1); // Backup to opening '[' - buf.append(parser.parseSet(rule, pp)); - pos = pp.getIndex(); - break; case CURSOR_POS: if (cursor >= 0) { syntaxError("Multiple cursors", rule, start); @@ -718,6 +737,9 @@ class TransliteratorParser { } } break; + case DOT: + buf.append(parser.getDotStandIn()); + break; case KLEENE_STAR: case ONE_OR_MORE: case ZERO_OR_ONE: @@ -783,7 +805,6 @@ class 
TransliteratorParser { buf.append(parser.generateStandInFor(m)); } break; - // case SET_CLOSE: default: // Disallow unquoted characters other than [0-9A-Za-z] // in the printable ASCII range. These characters are @@ -1357,6 +1378,17 @@ class TransliteratorParser { return variableNext++; } + /** + * Return the stand-in for the dot set. It is allocated the first + * time and reused thereafter. + */ + char getDotStandIn() { + if (dotStandIn == -1) { + dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET)); + } + return (char) dotStandIn; + } + /** * Append the value of the given variable name to the given * StringBuffer. diff --git a/icu4j/src/com/ibm/text/UnicodePropertySet.java b/icu4j/src/com/ibm/text/UnicodePropertySet.java new file mode 100755 index 00000000000..72d2339262d --- /dev/null +++ b/icu4j/src/com/ibm/text/UnicodePropertySet.java @@ -0,0 +1,590 @@ +/* +********************************************************************** +* Copyright (c) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodePropertySet.java,v $ +* $Date: 2001/10/17 19:17:06 $ +* $Revision: 1.1 $ +********************************************************************** +*/ +package com.ibm.text; + +import java.text.*; +import java.util.*; +import com.ibm.util.Utility; + +/** + * INTERNAL CLASS implementing the UnicodeSet properties as outlined + * at: + * + * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html + * + * Recognized syntax: + * + * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]" + * \p{foo} \P{foo} - white space not allowed within "\p" or "\P" + * + * Other than the above restrictions, white space is ignored. Case + * is ignored except in "\p" and "\P". + * + * This class cannot be instantiated. 
It has a public static method, + * createPropertySet(), with takes a pattern to be parsed and returns + * a new UnicodeSet. Another public static method, + * resemblesPattern(), returns true if a given pattern string appears + * to be a property set pattern, and therefore should be passed in to + * createPropertySet(). + * + * NOTE: Current implementation is incomplete. The following list + * indicates which properties are supported. + * + * + GeneralCategory + * CombiningClass + * BidiClass + * DecompositionType + * + NumericValue + * NumericType + * EastAsianWidth + * LineBreak + * JoiningType + * + Script + * + * '+' indicates a supported property. + * + * @author Alan Liu + * @version $RCSfile: UnicodePropertySet.java,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:17:06 $ + */ +class UnicodePropertySet { + + private static final Hashtable NAME_MAP = new Hashtable(); + + private static final Hashtable CATEGORY_MAP = new Hashtable(); + + /** + * A cache mapping character category integers, as returned by + * UCharacter.getType(), to sets. Entries are initially + * null and are created on demand. + */ + private static final UnicodeSet[] CATEGORY_CACHE = + new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT]; + + /** + * A cache mapping script integers, as defined by + * UScript, to sets. Entries are initially + * null and are created on demand. + */ + private static final UnicodeSet[] SCRIPT_CACHE = + new UnicodeSet[UScript.CODE_LIMIT]; + + // Special value codes + private static final int ANY = -1; // general category: all code points + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + + /** + * Return true if the given position, in the given pattern, appears + * to be the start of a property set pattern [:foo:], \p{foo}, or + * \P{foo}. 
+ */ + public static boolean resemblesPattern(String pattern, int pos) { + // Patterns are at least 5 characters long + if ((pos+5) > pattern.length()) { + return false; + } + + // Look for an opening [:, [:^, \p, or \P + return pattern.regionMatches(pos, "[:", 0, 2) || + pattern.regionMatches(true, pos, "\\p", 0, 2); + } + + /** + * Create a UnicodeSet by parsing the given pattern at the given + * parse position. + * + * @param pattern the pattern string + * @param ppos on entry, the position at which to begin parsing. + * This shold be one of the locations marked '^': + * + * [:blah:] \p{blah} \P{blah} + * ^ % ^ % ^ % + * + * On return, the position after the last character parsed, that is, + * the locations marked '%'. If the parse fails, ppos is returned + * unchanged. + * @return a newly-constructed UnicodeSet object, or null upon + * failure. + */ + public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) { + + UnicodeSet set = null; + + int pos = ppos.getIndex(); + + // On entry, ppos should point to one of the following locations: + + // Minimum length is 5 characters, e.g. \p{L} + if ((pos+5) > pattern.length()) { + return null; + } + + boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat} + boolean invert = false; + + // Look for an opening [:, [:^, \p, or \P + if (pattern.regionMatches(pos, "[:", 0, 2)) { + posix = true; + pos = skipWhitespace(pattern, pos+2); + if (pos < pattern.length() && pattern.charAt(pos) == '^') { + ++pos; + invert = true; + } + } else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) { + invert = (pattern.charAt(pos+1) == 'P'); + pos = skipWhitespace(pattern, pos+2); + if (pos == pattern.length() || pattern.charAt(pos++) != '{') { + // Syntax error; "\p" or "\P" not followed by "{" + return null; + } + } else { + // Open delimiter not seen + return null; + } + + // Look for the matching close delimiter, either :] or } + int close = pattern.indexOf(posix ? 
":]" : "}", pos); + if (close < 0) { + // Syntax error; close delimiter missing + return null; + } + + // Look for an '=' sign. If this is present, we will parse a + // medium \p{gc=Cf} or long \p{GeneralCategory=Format} + // pattern. + int equals = pattern.indexOf('=', pos); + if (equals >= 0 && equals < close) { + // Equals seen; parse medium/long pattern + String typeName = munge(pattern, pos, equals); + String valueName = munge(pattern, equals+1, close); + SetFactory factory; + factory = (SetFactory) NAME_MAP.get(typeName); + if (factory == null) { + // Syntax error; type name not recognized + return null; + } + set = factory.create(valueName); + } else { + // No equals seen; parse short format \p{Cf} + String shortName = munge(pattern, pos, close); + + // First try general category + set = createCategorySet(shortName); + + // If this fails, try script + if (set == null) { + set = createScriptSet(shortName); + } + } + + if (invert) { + set.complement(); + } + + // Move to the limit position after the close delimiter + ppos.setIndex(close + (posix ? 2 : 1)); + + return set; + } + + //---------------------------------------------------------------- + // Property set factory classes + // NOTE: This will change/go away when we implement UCharacter + // based property retrieval. 
+ //---------------------------------------------------------------- + + static interface SetFactory { + + UnicodeSet create(String valueName); + } + + static class NumericValueFactory implements SetFactory { + NumericValueFactory() {} + public UnicodeSet create(String valueName) { + double value = Double.parseDouble(valueName); + final int ivalue = (int) value; + if (ivalue != value || ivalue < 0) { + // UCharacter doesn't support negative or non-integral + // values, so just return an empty set + return new UnicodeSet(); + } + return createSetFromFilter(new Filter() { + public boolean contains(int cp) { + return UCharacter.getUnicodeNumericValue(cp) == ivalue; + } + }); + } + } + + //---------------------------------------------------------------- + // Property set factory static methods + // NOTE: This will change/go away when we implement UCharacter + // based property retrieval. + //---------------------------------------------------------------- + + /** + * Given a general category value name, create a corresponding + * set and return it, or return null if the name is invalid. + * @param valueName a pre-munged general category value name + */ + private static UnicodeSet createCategorySet(String valueName) { + Integer valueObj; + valueObj = (Integer) CATEGORY_MAP.get(valueName); + if (valueObj == null) { + return null; + } + int valueCode = valueObj.intValue(); + + UnicodeSet set = new UnicodeSet(); + if (valueCode == ANY) { + set.complement(); + return set; + } + for (int cat=0; cat= 0) { + set.add(start, end); + } + start = end = i; + } + } + } + if (start >= 0) { + set.add(start, end); + } + return set; + } + + //---------------------------------------------------------------- + // Type and value name maps + //---------------------------------------------------------------- + + /** + * Add a type mapping to the name map. 
+ */ + private static void addType(String shortName, String longName, + SetFactory factory) { + // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD + if (true) { + if (NAME_MAP.get(shortName) != null) { + throw new InternalError("Duplicate name " + shortName); + } + if (NAME_MAP.get(longName) != null) { + throw new InternalError("Duplicate name " + longName); + } + } + + NAME_MAP.put(shortName, factory); + NAME_MAP.put(longName, factory); + } + + /** + * Add a value mapping to the name map. + */ + private static void addValue(Hashtable map, + String shortName, String longName, + int value) { + // DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD + if (true) { + if (map.get(shortName) != null) { + throw new InternalError("Duplicate name " + shortName); + } + if (longName != null && map.get(longName) != null) { + throw new InternalError("Duplicate name " + longName); + } + } + + Integer valueObj = new Integer(value); + map.put(shortName, valueObj); + if (longName != null) { + map.put(longName, valueObj); + } + } + + static { + // NOTE: We munge all search keys to have no whitespace + // and upper case. As such, all stored keys should have + // this format. 
+ + // Load the map with type data + + addType("GC", "GENERALCATEGORY", new SetFactory() { + public UnicodeSet create(String valueName) { + return createCategorySet(valueName); + } + }); + + //addType("CC", "COMBININGCLASS", COMBINING_CLASS); + //addType("BC", "BIDICLASS", BIDI_CLASS); + //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE); + + addType("NV", "NUMERICVALUE", new NumericValueFactory()); + + //addType("NT", "NUMERICTYPE", NUMERIC_TYPE); + //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH); + //addType("LB", "LINEBREAK", LINE_BREAK); + //addType("JT", "JOININGTYPE", JOINING_TYPE); + + addType("SC", "SCRIPT", new SetFactory() { + public UnicodeSet create(String valueName) { + return createScriptSet(valueName); + } + }); + + // Load the map with value data + + // General Category + + addValue(CATEGORY_MAP, "ANY", null, ANY); // special case + + addValue(CATEGORY_MAP, "C", "OTHER", + (1 << UCharacterCategory.CONTROL) | + (1 << UCharacterCategory.FORMAT) | + (1 << UCharacterCategory.GENERAL_OTHER_TYPES) | + (1 << UCharacterCategory.PRIVATE_USE) | + (1 << UCharacterCategory.SURROGATE)); + + addValue(CATEGORY_MAP, "CC", "CONTROL", + 1 << UCharacterCategory.CONTROL); + addValue(CATEGORY_MAP, "CF", "FORMAT", + 1 << UCharacterCategory.FORMAT); + addValue(CATEGORY_MAP, "CN", "UNASSIGNED", + 1 << UCharacterCategory.GENERAL_OTHER_TYPES); + addValue(CATEGORY_MAP, "CO", "PRIVATEUSE", + 1 << UCharacterCategory.PRIVATE_USE); + addValue(CATEGORY_MAP, "CS", "SURROGATE", + 1 << UCharacterCategory.SURROGATE); + + addValue(CATEGORY_MAP, "L", "LETTER", + (1 << UCharacterCategory.LOWERCASE_LETTER) | + (1 << UCharacterCategory.MODIFIER_LETTER) | + (1 << UCharacterCategory.OTHER_LETTER) | + (1 << UCharacterCategory.TITLECASE_LETTER) | + (1 << UCharacterCategory.UPPERCASE_LETTER)); + + addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER", + 1 << UCharacterCategory.LOWERCASE_LETTER); + addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER", + 1 << UCharacterCategory.MODIFIER_LETTER); + 
addValue(CATEGORY_MAP, "LO", "OTHERLETTER", + 1 << UCharacterCategory.OTHER_LETTER); + addValue(CATEGORY_MAP, "LT", "TITLECASELETTER", + 1 << UCharacterCategory.TITLECASE_LETTER); + addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER", + 1 << UCharacterCategory.UPPERCASE_LETTER); + + addValue(CATEGORY_MAP, "M", "MARK", + (1 << UCharacterCategory.NON_SPACING_MARK) | + (1 << UCharacterCategory.COMBINING_SPACING_MARK) | + (1 << UCharacterCategory.ENCLOSING_MARK)); + + addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK", + 1 << UCharacterCategory.NON_SPACING_MARK); + addValue(CATEGORY_MAP, "MC", "SPACINGMARK", + 1 << UCharacterCategory.COMBINING_SPACING_MARK); + addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK", + 1 << UCharacterCategory.ENCLOSING_MARK); + + addValue(CATEGORY_MAP, "N", "NUMBER", + (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) | + (1 << UCharacterCategory.LETTER_NUMBER) | + (1 << UCharacterCategory.OTHER_NUMBER)); + + addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER", + 1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER); + addValue(CATEGORY_MAP, "NL", "LETTERNUMBER", + 1 << UCharacterCategory.LETTER_NUMBER); + addValue(CATEGORY_MAP, "NO", "OTHERNUMBER", + 1 << UCharacterCategory.OTHER_NUMBER); + + addValue(CATEGORY_MAP, "P", "PUNCTUATION", + (1 << UCharacterCategory.CONNECTOR_PUNCTUATION) | + (1 << UCharacterCategory.DASH_PUNCTUATION) | + (1 << UCharacterCategory.END_PUNCTUATION) | + (1 << UCharacterCategory.FINAL_PUNCTUATION) | + (1 << UCharacterCategory.INITIAL_PUNCTUATION) | + (1 << UCharacterCategory.OTHER_PUNCTUATION) | + (1 << UCharacterCategory.START_PUNCTUATION)); + + addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION", + 1 << UCharacterCategory.CONNECTOR_PUNCTUATION); + addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION", + 1 << UCharacterCategory.DASH_PUNCTUATION); + addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION", + 1 << UCharacterCategory.END_PUNCTUATION); + addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION", + 1 << UCharacterCategory.FINAL_PUNCTUATION); + 
addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION", + 1 << UCharacterCategory.INITIAL_PUNCTUATION); + addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION", + 1 << UCharacterCategory.OTHER_PUNCTUATION); + addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION", + 1 << UCharacterCategory.START_PUNCTUATION); + + addValue(CATEGORY_MAP, "S", "SYMBOL", + (1 << UCharacterCategory.CURRENCY_SYMBOL) | + (1 << UCharacterCategory.MODIFIER_SYMBOL) | + (1 << UCharacterCategory.MATH_SYMBOL) | + (1 << UCharacterCategory.OTHER_SYMBOL)); + + addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL", + 1 << UCharacterCategory.CURRENCY_SYMBOL); + addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL", + 1 << UCharacterCategory.MODIFIER_SYMBOL); + addValue(CATEGORY_MAP, "SM", "MATHSYMBOL", + 1 << UCharacterCategory.MATH_SYMBOL); + addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL", + 1 << UCharacterCategory.OTHER_SYMBOL); + + addValue(CATEGORY_MAP, "Z", "SEPARATOR", + (1 << UCharacterCategory.LINE_SEPARATOR) | + (1 << UCharacterCategory.PARAGRAPH_SEPARATOR) | + (1 << UCharacterCategory.SPACE_SEPARATOR)); + + addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR", + 1 << UCharacterCategory.LINE_SEPARATOR); + addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR", + 1 << UCharacterCategory.PARAGRAPH_SEPARATOR); + addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR", + 1 << UCharacterCategory.SPACE_SEPARATOR); + } +} diff --git a/icu4j/src/com/ibm/text/UnicodeSet.java b/icu4j/src/com/ibm/text/UnicodeSet.java index 9631baea563..31d25f8dc07 100755 --- a/icu4j/src/com/ibm/text/UnicodeSet.java +++ b/icu4j/src/com/ibm/text/UnicodeSet.java @@ -5,8 +5,8 @@ ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $ - * $Date: 2001/10/10 21:35:05 $ - * $Revision: 1.39 $ + * $Date: 2001/10/17 19:17:06 $ + * $Revision: 1.40 $ * ***************************************************************************************** */ @@ -202,60 +202,26 @@ import 
com.ibm.util.Utility; * starting wih 'L', that is, [[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]. * * - *

Character categories. + *

Character properties. * - * Character categories are specified using the POSIX-like syntax - * '[:Lu:]'. The complement of a category is specified by inserting - * '^' after the opening '[:'. The following category names are - * recognized. Actual determination of category data uses - * Character.getType(), so it reflects the underlying - * implmementation used by Character. As of Java 2 and - * JDK 1.1.8, this is Unicode 2.1.2. + *

Character properties are specified using the POSIX-like syntax + * "[:Lu:]" or the Perl-like syntax "\p{Lu}". The complement of a + * category is specified as "[:^Lu:]" or "\P{Lu}". Actual + * determination of category data is accomplished by UCharacter using + * the underlying Unicode database. * - *

- * Normative
- *     Mn = Mark, Non-Spacing
- *     Mc = Mark, Spacing Combining
- *     Me = Mark, Enclosing
+ * 

For details of the property syntax please see this + * + * draft document. * - * Nd = Number, Decimal Digit - * Nl = Number, Letter - * No = Number, Other - * - * Zs = Separator, Space - * Zl = Separator, Line - * Zp = Separator, Paragraph - * - * Cc = Other, Control - * Cf = Other, Format - * Cs = Other, Surrogate - * Co = Other, Private Use - * Cn = Other, Not Assigned - * - * Informative - * Lu = Letter, Uppercase - * Ll = Letter, Lowercase - * Lt = Letter, Titlecase - * Lm = Letter, Modifier - * Lo = Letter, Other - * - * Pc = Punctuation, Connector - * Pd = Punctuation, Dash - * Ps = Punctuation, Open - * Pe = Punctuation, Close - * *Pi = Punctuation, Initial quote - * *Pf = Punctuation, Final quote - * Po = Punctuation, Other - * - * Sm = Symbol, Math - * Sc = Symbol, Currency - * Sk = Symbol, Modifier - * So = Symbol, Other - *

- * *Unsupported by Java (and hence unsupported by UnicodeSet). + *

Note: Not all properties are currently supported. + * Currently, only the general category, script, and numeric value + * properties are supported. Support for other properties will be + * added in the future. * * @author Alan Liu - * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.39 $ $Date: 2001/10/10 21:35:05 $ */ + * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.40 $ $Date: 2001/10/17 19:17:06 $ + */ public class UnicodeSet extends UnicodeFilter { /* Implementation Notes. @@ -313,29 +279,12 @@ public class UnicodeSet extends UnicodeFilter { * modified using the non-pattern API, this string will be null, * indicating that toPattern() must generate a pattern * representation from the inversion list. - */ + */ private String pat = null; private static final int START_EXTRA = 16; // initial storage. Must be >= 0 private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0 - private static final String CATEGORY_NAMES = - // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 - //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8 - "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo"; - - private static final int UNSUPPORTED_CATEGORY = 17; - - private static final int CATEGORY_COUNT = 29; - - /** - * A cache mapping character category integers, as returned by - * Character.getType(), to inversion lists. Entries are initially - * null and are created on demand. 
- */ - private static final UnicodeSet[] CATEGORY_CACHE = - new UnicodeSet[CATEGORY_COUNT]; - //---------------------------------------------------------------- // Public API //---------------------------------------------------------------- @@ -408,19 +357,27 @@ public class UnicodeSet extends UnicodeFilter { applyPattern(pattern, pos, symbols, true); } + private static final String CATEGORY_NAMES = + // 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 + //0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8 + "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo"; /** - * Constructs a set from the given Unicode character category. + * DEPRECATED - Constructs a set from the given Unicode character + * category. * @param category an integer indicating the character category as - * returned by Character.getType(). + * returned by java.lang.Character.getType(). Note + * that this is different from the UCharacterCategory + * codes. * @exception java.lang.IllegalArgumentException if the given * category is invalid. + * @deprecated this will be removed Dec-31-2001 */ public UnicodeSet(int category) { - if (category < 0 || category >= CATEGORY_COUNT || - category == UNSUPPORTED_CATEGORY) { + if (category < 0 || category > java.lang.Character.OTHER_SYMBOL || + category == 17) { throw new IllegalArgumentException("Invalid category"); } - set(getCategorySet(category)); + applyPattern(CATEGORY_NAMES.substring(2*category, 2*category+2), false); } /** @@ -489,6 +446,16 @@ public class UnicodeSet extends UnicodeFilter { } } + /** + * Return true if the given position, in the given pattern, appears + * to be the start of a UnicodeSet pattern. + */ + public static boolean resemblesPattern(String pattern, int pos) { + return ((pos+1) < pattern.length() && + pattern.charAt(pos) == '[') || + UnicodePropertySet.resemblesPattern(pattern, pos); + } + /** * Append the toPattern() representation of a * character to the given StringBuffer. 
@@ -509,6 +476,8 @@ public class UnicodeSet extends UnicodeFilter { case '^': // COMPLEMENT: case '&': // INTERSECTION: case '\\': //BACKSLASH: + case '{': + case '}': buf.append('\\'); break; default: @@ -607,28 +576,28 @@ public class UnicodeSet extends UnicodeFilter { } return result; } - + return _generatePattern(result, escapeUnprintable); } /** * Generate and append a string representation of this set to result. * This does not use this.pat, the cleaned up copy of the string - * passed to applyPattern(). + * passed to applyPattern(). */ public StringBuffer _generatePattern(StringBuffer result, boolean escapeUnprintable) { result.append('['); - // Check against the predefined categories. We implicitly build - // up ALL category sets the first time toPattern() is called. - for (int cat=0; cat 2 || anchor == 1) { throw new IllegalArgumentException("Syntax error near $" + pattern); - + } if (anchor == 2) { rebuildPattern = true; @@ -1524,16 +1484,24 @@ public class UnicodeSet extends UnicodeFilter { } } - if (lastChar != NONE) { + if (mode < 4) { + throw new IllegalArgumentException("Missing ']'"); + } + + // Treat a trailing '$' as indicating ETHER. This code is only + // executed if symbols == NULL; otherwise other code parses the + // anchor. 
+ if (lastChar == SymbolTable.SYMBOL_REF) { + rebuildPattern = true; + newPat.append(lastChar); + add(TransliterationRule.ETHER); + } + + else if (lastChar != NONE) { add(lastChar, lastChar); _appendToPat(newPat, lastChar, false); } -// if (mode == 0) { -// throw new IllegalArgumentException("Missing '[' in \"" + -// pattern.substring(start) + '"'); -// } - // Handle unprocessed stuff preceding the closing ']' if (lastOp == '-') { // Trailing '-' is treated as literal @@ -1543,7 +1511,9 @@ public class UnicodeSet extends UnicodeFilter { throw new IllegalArgumentException("Unquoted trailing " + lastOp); } - newPat.append(']'); + if (mode == 4) { + newPat.append(']'); + } /** * If we saw a '^' after the initial '[' of this pattern, then perform @@ -1553,21 +1523,6 @@ public class UnicodeSet extends UnicodeFilter { complement(); } - if (mode != 4) { - throw new IllegalArgumentException("Missing ']'"); - } - -// /** -// * i indexes the last character we parsed or is pattern.length(). In -// * the latter case, we have run off the end without finding a closing -// * ']'. Otherwise, we know i < pattern.length(), and we set the -// * ParsePosition to the next character to be parsed. -// */ -// if (i == limit) { -// throw new IllegalArgumentException("Missing ']' in \"" + -// pattern.substring(start) + '"'); -// } - pos.setIndex(i); // Use the rebuilt pattern (newPat) only if necessary. Prefer the @@ -1586,136 +1541,6 @@ public class UnicodeSet extends UnicodeFilter { } } - //---------------------------------------------------------------- - // Implementation: Generation of Unicode categories - //---------------------------------------------------------------- - - /** - * Sets this object to the given category, given its name. - * The category name must be either a two-letter name, such as - * "Lu", or a one letter name, such as "L". One-letter names - * indicate the logical union of all two-letter names that start - * with that letter. Case is significant. 
If the name starts - * with the character '^' then the complement of the given - * character set is returned. - * - * Although individual categories such as "Lu" are cached, we do - * not currently cache single-letter categories such as "L" or - * complements such as "^Lu" or "^L". It would be easy to cache - * these as well in a hashtable should the need arise. - * - * NEW: The category name can now be a script name, as defined - * by UScript. - */ - private void applyCategory(String catName) { - boolean invert = (catName.length() > 1 && - catName.charAt(0) == '^'); - if (invert) { - catName = catName.substring(1); - } - - boolean match = false; - - // BE CAREFUL not to modify the return value from - // getCategorySet(int). - - // if we have two characters, search the category map for that - // code and either construct and return a UnicodeSet from the - // data in the category map or throw an exception - if (catName.length() == 2) { - int i = CATEGORY_NAMES.indexOf(catName); - if (i>=0 && i%2==0) { - i /= 2; - if (i != UNSUPPORTED_CATEGORY) { - set(getCategorySet(i)); - match = true; - } - } - } else if (catName.length() == 1) { - // if we have one character, search the category map for - // codes beginning with that letter, and union together - // all of the matching sets that we find (or throw an - // exception if there are no matches) - clear(); - for (int i=0; i= 0) { - add(start, end); - } - start = end = i; - } - } - } - if (start >= 0) { - add(start, end); - } - } - } - - if (!match) { - throw new IllegalArgumentException("Illegal category [:" + catName + ":]"); - } - - if (invert) { - complement(); - } - } - - /** - * Returns an inversion list for the given category. This list is - * cached and returned again if this method is called again with - * the same parameter. - * - * Callers MUST NOT MODIFY the returned set. 
- */ - private static UnicodeSet getCategorySet(int cat) { - if (CATEGORY_CACHE[cat] == null) { - // Walk through all Unicode characters, noting the start - // and end of each range for which Character.getType(c) - // returns the given category integer. - UnicodeSet set = new UnicodeSet(); - int start = -1; - int end = -2; - for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) { - if (Character.getType((char)i) == cat) { - if ((end+1) == i) { - end = i; - } else { - if (start >= 0) { - set.add(start, end); - } - start = end = i; - } - } - } - if (start >= 0) { - set.add(start, end); - } - CATEGORY_CACHE[cat] = set; - } - return CATEGORY_CACHE[cat]; - } - //---------------------------------------------------------------- // Implementation: Utility methods //----------------------------------------------------------------