mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-86 initial implementation of perl-ish character property syntax for UnicodeSet
X-SVN-Rev: 6280
This commit is contained in:
parent
e33659c6ef
commit
a4a66fdc7f
12 changed files with 1590 additions and 654 deletions
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
|
||||
* $Date: 2001/10/10 20:23:27 $
|
||||
* $Revision: 1.52 $
|
||||
* $Date: 2001/10/17 19:19:00 $
|
||||
* $Revision: 1.53 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -1318,7 +1318,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
public TestFact(String theID) {
|
||||
id = theID;
|
||||
}
|
||||
public Transliterator getInstance() {
|
||||
public Transliterator getInstance(String ignoredID) {
|
||||
return new NameableNullTrans(id);
|
||||
}
|
||||
};
|
||||
|
@ -1533,6 +1533,15 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test new property set syntax
|
||||
*/
|
||||
public void TestPropertySet() {
|
||||
expect("a>A; \\p{Lu}>x; \\p{ALL}>y;", "abcDEF", "Ayyxxx");
|
||||
expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
|
||||
"[ a stitch ]\n[ in time ]\r[ saves 9]");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4j ONLY
|
||||
// These tests are not mirrored (yet) in icu4c at
|
||||
|
@ -1551,6 +1560,10 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
public void TestDebugIndic() {
|
||||
expect("'-'h\\u0323>a;", "-h\u0323", "a");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Ram's tests
|
||||
//======================================================================
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $
|
||||
* $Date: 2001/10/10 21:35:33 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2001/10/17 19:17:59 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -52,7 +52,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
// not used int TOP = 0x200; // Don't need to go over the whole range:
|
||||
set = new UnicodeSet("[:L:]");
|
||||
for (int i=0; i<0x200; ++i) {
|
||||
boolean l = Character.isLetter((char)i);
|
||||
boolean l = UCharacter.isLetter(i);
|
||||
if (l != set.contains((char)i)) {
|
||||
errln("FAIL: L contains " + (char)i + " = " +
|
||||
set.contains((char)i));
|
||||
|
@ -62,7 +62,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
|
||||
set = new UnicodeSet("[:Lu:]");
|
||||
for (int i=0; i<0x200; ++i) {
|
||||
boolean lu = (Character.getType((char)i) == Character.UPPERCASE_LETTER);
|
||||
boolean lu = (UCharacter.getType(i) == UCharacterCategory.UPPERCASE_LETTER);
|
||||
if (lu != set.contains((char)i)) {
|
||||
errln("FAIL: Lu contains " + (char)i + " = " +
|
||||
set.contains((char)i));
|
||||
|
@ -249,11 +249,13 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
/**
|
||||
* Test the [:Latin:] syntax.
|
||||
*/
|
||||
public void TestScriptSet() {
|
||||
public void TestPropertySet() {
|
||||
UnicodeSet set = new UnicodeSet("[:Latin:]");
|
||||
expectContainment(set, "aA", "\u0391\u03B1");
|
||||
set = new UnicodeSet("[:Greek:]");
|
||||
set = new UnicodeSet("[\\p{Greek}]");
|
||||
expectContainment(set, "\u0391\u03B1", "aA");
|
||||
set = new UnicodeSet("\\P{ GENERAL Category = upper case letter }");
|
||||
expectContainment(set, "abc", "ABC");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -453,7 +455,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
if (bad.length() > 0) {
|
||||
logln(Utility.escape("Fail: set " + set + " does not contain " + bad +
|
||||
errln(Utility.escape("FAIL: set " + set + " does not contain " + bad +
|
||||
", expected containment of " + charsIn));
|
||||
} else {
|
||||
logln(Utility.escape("Ok: set " + set + " contains " + charsIn));
|
||||
|
@ -468,7 +470,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
if (bad.length() > 0) {
|
||||
logln(Utility.escape("Fail: set " + set + " contains " + bad +
|
||||
errln(Utility.escape("FAIL: set " + set + " contains " + bad +
|
||||
", expected non-containment of " + charsOut));
|
||||
} else {
|
||||
logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut));
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Quantifier.java,v $
|
||||
* $Date: 2001/10/04 18:24:15 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -45,9 +45,15 @@ class Quantifier implements UnicodeMatcher {
|
|||
int start = offset[0];
|
||||
int count = 0;
|
||||
while (count < maxCount) {
|
||||
int pos = offset[0];
|
||||
int m = matcher.matches(text, offset, limit, incremental);
|
||||
if (m == U_MATCH) {
|
||||
++count;
|
||||
if (pos == offset[0]) {
|
||||
// If offset has not moved we have a zero-width match.
|
||||
// Don't keep matching it infinitely.
|
||||
break;
|
||||
}
|
||||
} else if (incremental && m == U_PARTIAL_MATCH) {
|
||||
return U_PARTIAL_MATCH;
|
||||
} else {
|
||||
|
|
|
@ -1,3 +1,13 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/TransliteratorParser.java,v $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.4 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.text;
|
||||
|
||||
import com.ibm.text.resources.ResourceReader;
|
||||
|
@ -85,6 +95,13 @@ class TransliteratorParser {
|
|||
*/
|
||||
private String undefinedVariableName;
|
||||
|
||||
/**
|
||||
* The stand-in character for the 'dot' set, represented by '.' in
|
||||
* patterns. This is allocated the first time it is needed, and
|
||||
* reused thereafter.
|
||||
*/
|
||||
private int dotStandIn = -1;
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Constants
|
||||
//----------------------------------------------------------------------
|
||||
|
@ -109,8 +126,6 @@ class TransliteratorParser {
|
|||
|
||||
private static final char CONTEXT_ANTE = '{'; // ante{key
|
||||
private static final char CONTEXT_POST = '}'; // key}post
|
||||
private static final char SET_OPEN = '[';
|
||||
private static final char SET_CLOSE = ']';
|
||||
private static final char CURSOR_POS = '|';
|
||||
private static final char CURSOR_OFFSET = '@';
|
||||
private static final char ANCHOR_START = '^';
|
||||
|
@ -119,6 +134,9 @@ class TransliteratorParser {
|
|||
private static final char ONE_OR_MORE = '+';
|
||||
private static final char ZERO_OR_ONE = '?';
|
||||
|
||||
private static final char DOT = '.';
|
||||
private static final String DOT_SET = "[^[:Zp:][:Zl:]\r\n$]";
|
||||
|
||||
// By definition, the ANCHOR_END special character is a
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
|
@ -541,6 +559,15 @@ class TransliteratorParser {
|
|||
// Text after a presumed end anchor is a syntax err
|
||||
syntaxError("Malformed variable reference", rule, start);
|
||||
}
|
||||
if (UnicodeSet.resemblesPattern(rule, pos-1)) {
|
||||
if (pp == null) {
|
||||
pp = new ParsePosition(0);
|
||||
}
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
pos = pp.getIndex();
|
||||
continue;
|
||||
}
|
||||
// Handle escapes
|
||||
if (c == ESCAPE) {
|
||||
if (pos == limit) {
|
||||
|
@ -682,14 +709,6 @@ class TransliteratorParser {
|
|||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case SET_OPEN:
|
||||
if (pp == null) {
|
||||
pp = new ParsePosition(0);
|
||||
}
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
pos = pp.getIndex();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (cursor >= 0) {
|
||||
syntaxError("Multiple cursors", rule, start);
|
||||
|
@ -718,6 +737,9 @@ class TransliteratorParser {
|
|||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
|
@ -783,7 +805,6 @@ class TransliteratorParser {
|
|||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
|
@ -1357,6 +1378,17 @@ class TransliteratorParser {
|
|||
return variableNext++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the stand-in for the dot set. It is allocated the first
|
||||
* time and reused thereafter.
|
||||
*/
|
||||
char getDotStandIn() {
|
||||
if (dotStandIn == -1) {
|
||||
dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
|
||||
}
|
||||
return (char) dotStandIn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* StringBuffer.
|
||||
|
|
590
icu4j/src/com/ibm/icu/text/UnicodePropertySet.java
Executable file
590
icu4j/src/com/ibm/icu/text/UnicodePropertySet.java
Executable file
|
@ -0,0 +1,590 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Attic/UnicodePropertySet.java,v $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.1 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.text;
|
||||
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
import com.ibm.util.Utility;
|
||||
|
||||
/**
|
||||
* INTERNAL CLASS implementing the UnicodeSet properties as outlined
|
||||
* at:
|
||||
*
|
||||
* http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
|
||||
*
|
||||
* Recognized syntax:
|
||||
*
|
||||
* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
|
||||
* \p{foo} \P{foo} - white space not allowed within "\p" or "\P"
|
||||
*
|
||||
* Other than the above restrictions, white space is ignored. Case
|
||||
* is ignored except in "\p" and "\P".
|
||||
*
|
||||
* This class cannot be instantiated. It has a public static method,
|
||||
* createFromPattern(), which takes a pattern to be parsed and returns
|
||||
* a new UnicodeSet. Another public static method,
|
||||
* resemblesPattern(), returns true if a given pattern string appears
|
||||
* to be a property set pattern, and therefore should be passed in to
|
||||
* createFromPattern().
|
||||
*
|
||||
* NOTE: Current implementation is incomplete. The following list
|
||||
* indicates which properties are supported.
|
||||
*
|
||||
* + GeneralCategory
|
||||
* CombiningClass
|
||||
* BidiClass
|
||||
* DecompositionType
|
||||
* + NumericValue
|
||||
* NumericType
|
||||
* EastAsianWidth
|
||||
* LineBreak
|
||||
* JoiningType
|
||||
* + Script
|
||||
*
|
||||
* '+' indicates a supported property.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodePropertySet.java,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:17:06 $
|
||||
*/
|
||||
class UnicodePropertySet {
|
||||
|
||||
private static final Hashtable NAME_MAP = new Hashtable();
|
||||
|
||||
private static final Hashtable CATEGORY_MAP = new Hashtable();
|
||||
|
||||
/**
|
||||
* A cache mapping character category integers, as returned by
|
||||
* UCharacter.getType(), to sets. Entries are initially
|
||||
* null and are created on demand.
|
||||
*/
|
||||
private static final UnicodeSet[] CATEGORY_CACHE =
|
||||
new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT];
|
||||
|
||||
/**
|
||||
* A cache mapping script integers, as defined by
|
||||
* UScript, to sets. Entries are initially
|
||||
* null and are created on demand.
|
||||
*/
|
||||
private static final UnicodeSet[] SCRIPT_CACHE =
|
||||
new UnicodeSet[UScript.CODE_LIMIT];
|
||||
|
||||
// Special value codes
|
||||
private static final int ANY = -1; // general category: all code points
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a property set pattern [:foo:], \p{foo}, or
|
||||
* \P{foo}.
|
||||
*/
|
||||
public static boolean resemblesPattern(String pattern, int pos) {
|
||||
// Patterns are at least 5 characters long
|
||||
if ((pos+5) > pattern.length()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
return pattern.regionMatches(pos, "[:", 0, 2) ||
|
||||
pattern.regionMatches(true, pos, "\\p", 0, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a UnicodeSet by parsing the given pattern at the given
|
||||
* parse position.
|
||||
*
|
||||
* @param pattern the pattern string
|
||||
* @param ppos on entry, the position at which to begin parsing.
|
||||
* This should be one of the locations marked '^':
|
||||
*
|
||||
* [:blah:] \p{blah} \P{blah}
|
||||
* ^ % ^ % ^ %
|
||||
*
|
||||
* On return, the position after the last character parsed, that is,
|
||||
* the locations marked '%'. If the parse fails, ppos is returned
|
||||
* unchanged.
|
||||
* @return a newly-constructed UnicodeSet object, or null upon
|
||||
* failure.
|
||||
*/
|
||||
public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) {
|
||||
|
||||
UnicodeSet set = null;
|
||||
|
||||
int pos = ppos.getIndex();
|
||||
|
||||
// On entry, ppos should point to one of the following locations:
|
||||
|
||||
// Minimum length is 5 characters, e.g. \p{L}
|
||||
if ((pos+5) > pattern.length()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat}
|
||||
boolean invert = false;
|
||||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
if (pattern.regionMatches(pos, "[:", 0, 2)) {
|
||||
posix = true;
|
||||
pos = skipWhitespace(pattern, pos+2);
|
||||
if (pos < pattern.length() && pattern.charAt(pos) == '^') {
|
||||
++pos;
|
||||
invert = true;
|
||||
}
|
||||
} else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) {
|
||||
invert = (pattern.charAt(pos+1) == 'P');
|
||||
pos = skipWhitespace(pattern, pos+2);
|
||||
if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
|
||||
// Syntax error; "\p" or "\P" not followed by "{"
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
// Open delimiter not seen
|
||||
return null;
|
||||
}
|
||||
|
||||
// Look for the matching close delimiter, either :] or }
|
||||
int close = pattern.indexOf(posix ? ":]" : "}", pos);
|
||||
if (close < 0) {
|
||||
// Syntax error; close delimiter missing
|
||||
return null;
|
||||
}
|
||||
|
||||
// Look for an '=' sign. If this is present, we will parse a
|
||||
// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
|
||||
// pattern.
|
||||
int equals = pattern.indexOf('=', pos);
|
||||
if (equals >= 0 && equals < close) {
|
||||
// Equals seen; parse medium/long pattern
|
||||
String typeName = munge(pattern, pos, equals);
|
||||
String valueName = munge(pattern, equals+1, close);
|
||||
SetFactory factory;
|
||||
factory = (SetFactory) NAME_MAP.get(typeName);
|
||||
if (factory == null) {
|
||||
// Syntax error; type name not recognized
|
||||
return null;
|
||||
}
|
||||
set = factory.create(valueName);
|
||||
} else {
|
||||
// No equals seen; parse short format \p{Cf}
|
||||
String shortName = munge(pattern, pos, close);
|
||||
|
||||
// First try general category
|
||||
set = createCategorySet(shortName);
|
||||
|
||||
// If this fails, try script
|
||||
if (set == null) {
|
||||
set = createScriptSet(shortName);
|
||||
}
|
||||
}
|
||||
|
||||
if (invert) {
|
||||
set.complement();
|
||||
}
|
||||
|
||||
// Move to the limit position after the close delimiter
|
||||
ppos.setIndex(close + (posix ? 2 : 1));
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Property set factory classes
|
||||
// NOTE: This will change/go away when we implement UCharacter
|
||||
// based property retrieval.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
static interface SetFactory {
|
||||
|
||||
UnicodeSet create(String valueName);
|
||||
}
|
||||
|
||||
static class NumericValueFactory implements SetFactory {
|
||||
NumericValueFactory() {}
|
||||
public UnicodeSet create(String valueName) {
|
||||
double value = Double.parseDouble(valueName);
|
||||
final int ivalue = (int) value;
|
||||
if (ivalue != value || ivalue < 0) {
|
||||
// UCharacter doesn't support negative or non-integral
|
||||
// values, so just return an empty set
|
||||
return new UnicodeSet();
|
||||
}
|
||||
return createSetFromFilter(new Filter() {
|
||||
public boolean contains(int cp) {
|
||||
return UCharacter.getUnicodeNumericValue(cp) == ivalue;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Property set factory static methods
|
||||
// NOTE: This will change/go away when we implement UCharacter
|
||||
// based property retrieval.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Given a general category value name, create a corresponding
|
||||
* set and return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged general category value name
|
||||
*/
|
||||
private static UnicodeSet createCategorySet(String valueName) {
|
||||
Integer valueObj;
|
||||
valueObj = (Integer) CATEGORY_MAP.get(valueName);
|
||||
if (valueObj == null) {
|
||||
return null;
|
||||
}
|
||||
int valueCode = valueObj.intValue();
|
||||
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
if (valueCode == ANY) {
|
||||
set.complement();
|
||||
return set;
|
||||
}
|
||||
for (int cat=0; cat<UCharacterCategory.CHAR_CATEGORY_COUNT; ++cat) {
|
||||
if ((valueCode & (1 << cat)) != 0) {
|
||||
set.addAll(UnicodePropertySet.getCategorySet(cat));
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a script value name, create a corresponding set and
|
||||
* return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged script value name
|
||||
*/
|
||||
private static UnicodeSet createScriptSet(String valueName) {
|
||||
int script = UScript.getCode(valueName);
|
||||
if (script == UScript.INVALID_CODE) {
|
||||
// Syntax error; unknown short name
|
||||
return null;
|
||||
}
|
||||
return new UnicodeSet(getScriptSet(script));
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given category. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
private static UnicodeSet getCategorySet(final int cat) {
|
||||
if (CATEGORY_CACHE[cat] == null) {
|
||||
CATEGORY_CACHE[cat] =
|
||||
createSetFromFilter(new Filter() {
|
||||
public boolean contains(int cp) {
|
||||
return UCharacter.getType(cp) == cat;
|
||||
}
|
||||
});
|
||||
}
|
||||
return CATEGORY_CACHE[cat];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given script. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
private static UnicodeSet getScriptSet(final int script) {
|
||||
if (SCRIPT_CACHE[script] == null) {
|
||||
SCRIPT_CACHE[script] =
|
||||
createSetFromFilter(new Filter() {
|
||||
public boolean contains(int cp) {
|
||||
return UScript.getScript(cp) == script;
|
||||
}
|
||||
});
|
||||
}
|
||||
return SCRIPT_CACHE[script];
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a string, munge it to upper case and lose the whitespace.
|
||||
* So "General Category " becomes "GENERALCATEGORY". We munge all
|
||||
* type and value strings, and store all type and value keys
|
||||
* pre-munged.
|
||||
*/
|
||||
private static String munge(String str, int start, int limit) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=start; i<limit; ) {
|
||||
int c = UTF16.charAt(str, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (!UCharacter.isWhitespace(c)) {
|
||||
UTF16.append(buf, UCharacter.toUpperCase(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over a sequence of zero or more white space characters
|
||||
* at pos. Return the index of the first non-white-space character
|
||||
* at or after pos, or str.length(), if there is none.
|
||||
*/
|
||||
private static int skipWhitespace(String str, int pos) {
|
||||
while (pos < str.length()) {
|
||||
int c = UTF16.charAt(str, pos);
|
||||
if (!UCharacter.isWhitespace(c)) {
|
||||
break;
|
||||
}
|
||||
pos += UTF16.getCharCount(c);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Generic filter-based scanning code
|
||||
//
|
||||
// NOTE: In general, we don't want to do this! This is a temporary
|
||||
// implementation until we have time for something that examines
|
||||
// the underlying UCharacter data structures in an intelligent
|
||||
// way. Iterating over all code points is dumb. What we want to
|
||||
// do, for instance, is iterate over internally-stored ranges
|
||||
// of characters that have a given property.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
 * Predicate over code points, used by createSetFromFilter() to decide
 * which code points belong in the set being built.
 */
interface Filter {
    /**
     * @param codePoint the code point to test
     * @return true if the code point should be included in the set
     */
    boolean contains(int codePoint);
}
|
||||
|
||||
static UnicodeSet createSetFromFilter(Filter filter) {
|
||||
// Walk through all Unicode characters, noting the start
|
||||
// and end of each range for which filter.contain(c) is
|
||||
// true. Add each range to a set.
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
int start = -1;
|
||||
int end = -2;
|
||||
|
||||
// TODO Extend this up to UnicodeSet.MAX_VALUE when we have
|
||||
// better performance; i.e., when this code can get moved into
|
||||
// the UCharacter class and not have to iterate over code
|
||||
// points. Right now it's way too slow to iterate to 10FFFF.
|
||||
|
||||
for (int i=UnicodeSet.MIN_VALUE; i<=0xFFFF; ++i) {
|
||||
if (filter.contains(i)) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Type and value name maps
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Add a type mapping to the name map.
|
||||
*/
|
||||
private static void addType(String shortName, String longName,
|
||||
SetFactory factory) {
|
||||
// DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
|
||||
if (true) {
|
||||
if (NAME_MAP.get(shortName) != null) {
|
||||
throw new InternalError("Duplicate name " + shortName);
|
||||
}
|
||||
if (NAME_MAP.get(longName) != null) {
|
||||
throw new InternalError("Duplicate name " + longName);
|
||||
}
|
||||
}
|
||||
|
||||
NAME_MAP.put(shortName, factory);
|
||||
NAME_MAP.put(longName, factory);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a value mapping to the name map.
|
||||
*/
|
||||
private static void addValue(Hashtable map,
|
||||
String shortName, String longName,
|
||||
int value) {
|
||||
// DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
|
||||
if (true) {
|
||||
if (map.get(shortName) != null) {
|
||||
throw new InternalError("Duplicate name " + shortName);
|
||||
}
|
||||
if (longName != null && map.get(longName) != null) {
|
||||
throw new InternalError("Duplicate name " + longName);
|
||||
}
|
||||
}
|
||||
|
||||
Integer valueObj = new Integer(value);
|
||||
map.put(shortName, valueObj);
|
||||
if (longName != null) {
|
||||
map.put(longName, valueObj);
|
||||
}
|
||||
}
|
||||
|
||||
static {
    // NOTE: All search keys are munged to upper case with no white
    // space before lookup, so every key stored here must already
    // have that form.

    //------------------------------------------------------------
    // Property types
    //------------------------------------------------------------

    final SetFactory categoryFactory = new SetFactory() {
        public UnicodeSet create(String valueName) {
            return createCategorySet(valueName);
        }
    };
    addType("GC", "GENERALCATEGORY", categoryFactory);

    addType("NV", "NUMERICVALUE", new NumericValueFactory());

    final SetFactory scriptFactory = new SetFactory() {
        public UnicodeSet create(String valueName) {
            return createScriptSet(valueName);
        }
    };
    addType("SC", "SCRIPT", scriptFactory);

    // Types not registered yet (short / long name):
    //   CC / COMBININGCLASS       BC / BIDICLASS
    //   DT / DECOMPOSITIONTYPE    NT / NUMERICTYPE
    //   EA / EASTASIANWIDTH       LB / LINEBREAK
    //   JT / JOININGTYPE

    //------------------------------------------------------------
    // General category values
    //------------------------------------------------------------

    // Special case: matches all code points, has no long name
    addValue(CATEGORY_MAP, "ANY", null, ANY);

    // Each row is { short name, long name, member categories }.  The
    // stored value is a bit mask with bit N set for each member
    // UCharacterCategory type N.  Rows are registered in table order.
    final Object[][] categoryValues = {
        { "C", "OTHER", new int[] {
            UCharacterCategory.CONTROL,
            UCharacterCategory.FORMAT,
            UCharacterCategory.GENERAL_OTHER_TYPES,
            UCharacterCategory.PRIVATE_USE,
            UCharacterCategory.SURROGATE } },
        { "CC", "CONTROL", new int[] { UCharacterCategory.CONTROL } },
        { "CF", "FORMAT", new int[] { UCharacterCategory.FORMAT } },
        { "CN", "UNASSIGNED", new int[] { UCharacterCategory.GENERAL_OTHER_TYPES } },
        { "CO", "PRIVATEUSE", new int[] { UCharacterCategory.PRIVATE_USE } },
        { "CS", "SURROGATE", new int[] { UCharacterCategory.SURROGATE } },

        { "L", "LETTER", new int[] {
            UCharacterCategory.LOWERCASE_LETTER,
            UCharacterCategory.MODIFIER_LETTER,
            UCharacterCategory.OTHER_LETTER,
            UCharacterCategory.TITLECASE_LETTER,
            UCharacterCategory.UPPERCASE_LETTER } },
        { "LL", "LOWERCASELETTER", new int[] { UCharacterCategory.LOWERCASE_LETTER } },
        { "LM", "MODIFIERLETTER", new int[] { UCharacterCategory.MODIFIER_LETTER } },
        { "LO", "OTHERLETTER", new int[] { UCharacterCategory.OTHER_LETTER } },
        { "LT", "TITLECASELETTER", new int[] { UCharacterCategory.TITLECASE_LETTER } },
        { "LU", "UPPERCASELETTER", new int[] { UCharacterCategory.UPPERCASE_LETTER } },

        { "M", "MARK", new int[] {
            UCharacterCategory.NON_SPACING_MARK,
            UCharacterCategory.COMBINING_SPACING_MARK,
            UCharacterCategory.ENCLOSING_MARK } },
        { "MN", "NONSPACINGMARK", new int[] { UCharacterCategory.NON_SPACING_MARK } },
        { "MC", "SPACINGMARK", new int[] { UCharacterCategory.COMBINING_SPACING_MARK } },
        { "ME", "ENCLOSINGMARK", new int[] { UCharacterCategory.ENCLOSING_MARK } },

        { "N", "NUMBER", new int[] {
            UCharacterCategory.DECIMAL_DIGIT_NUMBER,
            UCharacterCategory.LETTER_NUMBER,
            UCharacterCategory.OTHER_NUMBER } },
        { "ND", "DECIMALNUMBER", new int[] { UCharacterCategory.DECIMAL_DIGIT_NUMBER } },
        { "NL", "LETTERNUMBER", new int[] { UCharacterCategory.LETTER_NUMBER } },
        { "NO", "OTHERNUMBER", new int[] { UCharacterCategory.OTHER_NUMBER } },

        { "P", "PUNCTUATION", new int[] {
            UCharacterCategory.CONNECTOR_PUNCTUATION,
            UCharacterCategory.DASH_PUNCTUATION,
            UCharacterCategory.END_PUNCTUATION,
            UCharacterCategory.FINAL_PUNCTUATION,
            UCharacterCategory.INITIAL_PUNCTUATION,
            UCharacterCategory.OTHER_PUNCTUATION,
            UCharacterCategory.START_PUNCTUATION } },
        { "PC", "CONNECTORPUNCTUATION", new int[] { UCharacterCategory.CONNECTOR_PUNCTUATION } },
        { "PD", "DASHPUNCTUATION", new int[] { UCharacterCategory.DASH_PUNCTUATION } },
        { "PE", "ENDPUNCTUATION", new int[] { UCharacterCategory.END_PUNCTUATION } },
        { "PF", "FINALPUNCTUATION", new int[] { UCharacterCategory.FINAL_PUNCTUATION } },
        { "PI", "INITIALPUNCTUATION", new int[] { UCharacterCategory.INITIAL_PUNCTUATION } },
        { "PO", "OTHERPUNCTUATION", new int[] { UCharacterCategory.OTHER_PUNCTUATION } },
        { "PS", "STARTPUNCTUATION", new int[] { UCharacterCategory.START_PUNCTUATION } },

        { "S", "SYMBOL", new int[] {
            UCharacterCategory.CURRENCY_SYMBOL,
            UCharacterCategory.MODIFIER_SYMBOL,
            UCharacterCategory.MATH_SYMBOL,
            UCharacterCategory.OTHER_SYMBOL } },
        { "SC", "CURRENCYSYMBOL", new int[] { UCharacterCategory.CURRENCY_SYMBOL } },
        { "SK", "MODIFIERSYMBOL", new int[] { UCharacterCategory.MODIFIER_SYMBOL } },
        { "SM", "MATHSYMBOL", new int[] { UCharacterCategory.MATH_SYMBOL } },
        { "SO", "OTHERSYMBOL", new int[] { UCharacterCategory.OTHER_SYMBOL } },

        { "Z", "SEPARATOR", new int[] {
            UCharacterCategory.LINE_SEPARATOR,
            UCharacterCategory.PARAGRAPH_SEPARATOR,
            UCharacterCategory.SPACE_SEPARATOR } },
        { "ZL", "LINESEPARATOR", new int[] { UCharacterCategory.LINE_SEPARATOR } },
        { "ZP", "PARAGRAPHSEPARATOR", new int[] { UCharacterCategory.PARAGRAPH_SEPARATOR } },
        { "ZS", "SPACESEPARATOR", new int[] { UCharacterCategory.SPACE_SEPARATOR } },
    };

    for (int row = 0; row < categoryValues.length; ++row) {
        final String shortName = (String) categoryValues[row][0];
        final String longName = (String) categoryValues[row][1];
        final int[] members = (int[]) categoryValues[row][2];

        // Fold the member categories into a single bit mask
        int mask = 0;
        for (int k = 0; k < members.length; ++k) {
            mask |= (1 << members[k]);
        }
        addValue(CATEGORY_MAP, shortName, longName, mask);
    }
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
|
||||
* $Date: 2001/10/10 21:35:05 $
|
||||
* $Revision: 1.39 $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.40 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -202,60 +202,26 @@ import com.ibm.util.Utility;
|
|||
* starting with 'L', that is, <code>[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]</code>.
|
||||
* </table>
|
||||
*
|
||||
* <p><b>Character categories.</b>
|
||||
* <p><b>Character properties.</b>
|
||||
*
|
||||
* Character categories are specified using the POSIX-like syntax
|
||||
* '[:Lu:]'. The complement of a category is specified by inserting
|
||||
* '^' after the opening '[:'. The following category names are
|
||||
* recognized. Actual determination of category data uses
|
||||
* <code>Character.getType()</code>, so it reflects the underlying
|
||||
* implementation used by <code>Character</code>. As of Java 2 and
|
||||
* JDK 1.1.8, this is Unicode 2.1.2.
|
||||
* <p>Character properties are specified using the POSIX-like syntax
|
||||
* "[:Lu:]" or the Perl-like syntax "\p{Lu}". The complement of a
|
||||
* category is specified as "[:^Lu:]" or "\P{Lu}". Actual
|
||||
* determination of category data is accomplished by UCharacter using
|
||||
* the underlying Unicode database.
|
||||
*
|
||||
* <pre>
|
||||
* Normative
|
||||
* Mn = Mark, Non-Spacing
|
||||
* Mc = Mark, Spacing Combining
|
||||
* Me = Mark, Enclosing
|
||||
* <p>For details of the property syntax please see this
|
||||
* <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html">
|
||||
* draft document</a>.
|
||||
*
|
||||
* Nd = Number, Decimal Digit
|
||||
* Nl = Number, Letter
|
||||
* No = Number, Other
|
||||
*
|
||||
* Zs = Separator, Space
|
||||
* Zl = Separator, Line
|
||||
* Zp = Separator, Paragraph
|
||||
*
|
||||
* Cc = Other, Control
|
||||
* Cf = Other, Format
|
||||
* Cs = Other, Surrogate
|
||||
* Co = Other, Private Use
|
||||
* Cn = Other, Not Assigned
|
||||
*
|
||||
* Informative
|
||||
* Lu = Letter, Uppercase
|
||||
* Ll = Letter, Lowercase
|
||||
* Lt = Letter, Titlecase
|
||||
* Lm = Letter, Modifier
|
||||
* Lo = Letter, Other
|
||||
*
|
||||
* Pc = Punctuation, Connector
|
||||
* Pd = Punctuation, Dash
|
||||
* Ps = Punctuation, Open
|
||||
* Pe = Punctuation, Close
|
||||
* *Pi = Punctuation, Initial quote
|
||||
* *Pf = Punctuation, Final quote
|
||||
* Po = Punctuation, Other
|
||||
*
|
||||
* Sm = Symbol, Math
|
||||
* Sc = Symbol, Currency
|
||||
* Sk = Symbol, Modifier
|
||||
* So = Symbol, Other
|
||||
* </pre>
|
||||
* *Unsupported by Java (and hence unsupported by UnicodeSet).
|
||||
* <p><em>Note:</em> Not all properties are currently supported.
|
||||
* Currently, only the general category, script, and numeric value
|
||||
* properties are supported. Support for other properties will be
|
||||
* added in the future.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.39 $ $Date: 2001/10/10 21:35:05 $ */
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.40 $ $Date: 2001/10/17 19:17:06 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
/* Implementation Notes.
|
||||
|
@ -313,29 +279,12 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
* modified using the non-pattern API, this string will be null,
|
||||
* indicating that toPattern() must generate a pattern
|
||||
* representation from the inversion list.
|
||||
*/
|
||||
*/
|
||||
private String pat = null;
|
||||
|
||||
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
|
||||
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
|
||||
|
||||
private static final String CATEGORY_NAMES =
|
||||
// 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
|
||||
//0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8
|
||||
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
|
||||
|
||||
private static final int UNSUPPORTED_CATEGORY = 17;
|
||||
|
||||
private static final int CATEGORY_COUNT = 29;
|
||||
|
||||
/**
|
||||
* A cache mapping character category integers, as returned by
|
||||
* Character.getType(), to inversion lists. Entries are initially
|
||||
* null and are created on demand.
|
||||
*/
|
||||
private static final UnicodeSet[] CATEGORY_CACHE =
|
||||
new UnicodeSet[CATEGORY_COUNT];
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
@ -408,19 +357,27 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
applyPattern(pattern, pos, symbols, true);
|
||||
}
|
||||
|
||||
private static final String CATEGORY_NAMES =
|
||||
// 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
|
||||
//0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8
|
||||
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
|
||||
/**
|
||||
* Constructs a set from the given Unicode character category.
|
||||
* DEPRECATED - Constructs a set from the given Unicode character
|
||||
* category.
|
||||
* @param category an integer indicating the character category as
|
||||
* returned by <code>Character.getType()</code>.
|
||||
* returned by <code>java.lang.Character.getType()</code>. Note
|
||||
* that this is <em>different</em> from the UCharacterCategory
|
||||
* codes.
|
||||
* @exception java.lang.IllegalArgumentException if the given
|
||||
* category is invalid.
|
||||
* @deprecated this will be removed Dec-31-2001
|
||||
*/
|
||||
public UnicodeSet(int category) {
|
||||
if (category < 0 || category >= CATEGORY_COUNT ||
|
||||
category == UNSUPPORTED_CATEGORY) {
|
||||
if (category < 0 || category > java.lang.Character.OTHER_SYMBOL ||
|
||||
category == 17) {
|
||||
throw new IllegalArgumentException("Invalid category");
|
||||
}
|
||||
set(getCategorySet(category));
|
||||
applyPattern(CATEGORY_NAMES.substring(2*category, 2*category+2), false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -489,6 +446,16 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a UnicodeSet pattern.
|
||||
*/
|
||||
public static boolean resemblesPattern(String pattern, int pos) {
|
||||
return ((pos+1) < pattern.length() &&
|
||||
pattern.charAt(pos) == '[') ||
|
||||
UnicodePropertySet.resemblesPattern(pattern, pos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the <code>toPattern()</code> representation of a
|
||||
* character to the given <code>StringBuffer</code>.
|
||||
|
@ -509,6 +476,8 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
case '^': // COMPLEMENT:
|
||||
case '&': // INTERSECTION:
|
||||
case '\\': //BACKSLASH:
|
||||
case '{':
|
||||
case '}':
|
||||
buf.append('\\');
|
||||
break;
|
||||
default:
|
||||
|
@ -607,28 +576,28 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
return _generatePattern(result, escapeUnprintable);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate and append a string representation of this set to result.
|
||||
* This does not use this.pat, the cleaned up copy of the string
|
||||
* passed to applyPattern().
|
||||
* passed to applyPattern().
|
||||
*/
|
||||
public StringBuffer _generatePattern(StringBuffer result,
|
||||
boolean escapeUnprintable) {
|
||||
result.append('[');
|
||||
|
||||
// Check against the predefined categories. We implicitly build
|
||||
// up ALL category sets the first time toPattern() is called.
|
||||
for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
|
||||
if (this.equals(getCategorySet(cat))) {
|
||||
result.append(':');
|
||||
result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
|
||||
return result.append(":]");
|
||||
}
|
||||
}
|
||||
// // Check against the predefined categories. We implicitly build
|
||||
// // up ALL category sets the first time toPattern() is called.
|
||||
// for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
|
||||
// if (this.equals(getCategorySet(cat))) {
|
||||
// result.append(':');
|
||||
// result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
|
||||
// return result.append(":]");
|
||||
// }
|
||||
// }
|
||||
|
||||
int count = getRangeCount();
|
||||
|
||||
|
@ -1205,7 +1174,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
StringBuffer newPat = new StringBuffer("[");
|
||||
int nestedPatStart = -1; // see below for usage
|
||||
boolean nestedPatDone = false; // see below for usage
|
||||
|
||||
|
||||
boolean invert = false;
|
||||
clear();
|
||||
|
||||
|
@ -1231,8 +1200,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// mode 1: '[' seen; if next is '^' or ':' then special
|
||||
// mode 2: '[' '^'? seen; parse pattern and close with ']'
|
||||
// mode 3: '[:' seen; parse category and close with ':]'
|
||||
// mode 4: ']' seen; parse complete
|
||||
// mode 5: Top-level property pattern seen
|
||||
int mode = 0;
|
||||
int colonPos = 0; // Expected pos of ':' in '[:'
|
||||
int start = pos.getIndex();
|
||||
int i = start;
|
||||
int limit = pattern.length();
|
||||
|
@ -1285,9 +1255,11 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// Parse the opening '[' and optional following '^'
|
||||
switch (mode) {
|
||||
case 0:
|
||||
if (c == '[') {
|
||||
if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
|
||||
mode = 3;
|
||||
break; // Fall through
|
||||
} else if (c == '[') {
|
||||
mode = 1; // Next look for '^'
|
||||
colonPos = i; // Expect ':' at next offset
|
||||
continue;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Missing opening '['");
|
||||
|
@ -1299,17 +1271,6 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
invert = true;
|
||||
newPat.append((char) c);
|
||||
continue; // Back to top to fetch next character
|
||||
case ':':
|
||||
if (i-1 == colonPos) {
|
||||
// '[:' cannot have whitespace in it
|
||||
--i; // Backup to the '['
|
||||
c = '[';
|
||||
mode = 3;
|
||||
// Fall through and parse category using the same
|
||||
// code used to parse a nested category. The mode
|
||||
// will indicate that this is actually top level.
|
||||
}
|
||||
break; // Fall through
|
||||
case '-':
|
||||
isLiteral = true; // Treat leading '-' as a literal
|
||||
break; // Fall through
|
||||
|
@ -1326,12 +1287,47 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// buffer. Characters in the variable buffer have already
|
||||
// been through escape and variable reference processing.
|
||||
if (varValueBuffer == null) {
|
||||
/**
|
||||
* Handle property set patterns.
|
||||
*/
|
||||
if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
|
||||
ParsePosition pp = new ParsePosition(i-1);
|
||||
nestedSet = UnicodePropertySet.createFromPattern(pattern, pp);
|
||||
if (nestedSet == null) {
|
||||
// assert(pp.getIndex() == i-1);
|
||||
throw new IllegalArgumentException("Invalid property pattern " +
|
||||
pattern.substring(i-1));
|
||||
}
|
||||
nestedPatStart = newPat.length();
|
||||
nestedPatDone = true; // we're going to do it just below
|
||||
|
||||
// If we have a top-level property pattern, then trim
|
||||
// off the opening '[' and use the property pattern
|
||||
// as the entire pattern.
|
||||
if (mode == 3) {
|
||||
newPat.deleteCharAt(0);
|
||||
}
|
||||
newPat.append(pattern.substring(i-1, pp.getIndex()));
|
||||
rebuildPattern = true;
|
||||
|
||||
i = pp.getIndex(); // advance past property pattern
|
||||
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse
|
||||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
set(nestedSet);
|
||||
mode = 5;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
else if (c == '\\') {
|
||||
int[] offset = new int[] { i };
|
||||
int escaped = Utility.unescapeAt(pattern, offset);
|
||||
if (escaped == -1) {
|
||||
|
@ -1373,61 +1369,25 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedSet accordingly.
|
||||
* subpattern.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Record position before nested pattern
|
||||
nestedPatStart = newPat.length();
|
||||
|
||||
// Handle "[:...:]", representing a character category
|
||||
if (i < pattern.length() && pattern.charAt(i) == ':') {
|
||||
++i;
|
||||
int j = pattern.indexOf(":]", i);
|
||||
if (j < 0) {
|
||||
throw new IllegalArgumentException("Missing \":]\"");
|
||||
}
|
||||
String scratch = pattern.substring(i, j);
|
||||
nestedSet = new UnicodeSet();
|
||||
nestedSet.applyCategory(scratch);
|
||||
nestedPatDone = true; // We're going to do it just below
|
||||
i = j+2; // Advance i past ":]"
|
||||
|
||||
// Use a rebuilt pattern. If we are top level,
|
||||
// then there is already a SET_OPEN in newPat, and
|
||||
// SET_CLOSE will be appended elsewhere.
|
||||
if (mode != 3) {
|
||||
newPat.append('[');
|
||||
}
|
||||
newPat.append(':').append(scratch).append(':');
|
||||
if (mode != 3) {
|
||||
newPat.append(']');
|
||||
}
|
||||
rebuildPattern = true;
|
||||
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse
|
||||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
set(nestedSet);
|
||||
mode = 4;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case '-':
|
||||
case '&':
|
||||
newPat.append(lastOp);
|
||||
break;
|
||||
}
|
||||
nestedSet = new UnicodeSet();
|
||||
nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
|
||||
nestedPatDone = true;
|
||||
i = pos.getIndex();
|
||||
// Recurse to get the pairs for this nested set.
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case '-':
|
||||
case '&':
|
||||
newPat.append(lastOp);
|
||||
break;
|
||||
}
|
||||
nestedSet = new UnicodeSet();
|
||||
nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
|
||||
nestedPatDone = true;
|
||||
i = pos.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1487,7 +1447,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// loop if the pattern is well-formed.
|
||||
if (anchor > 2 || anchor == 1) {
|
||||
throw new IllegalArgumentException("Syntax error near $" + pattern);
|
||||
|
||||
|
||||
}
|
||||
if (anchor == 2) {
|
||||
rebuildPattern = true;
|
||||
|
@ -1524,16 +1484,24 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
|
||||
if (lastChar != NONE) {
|
||||
if (mode < 4) {
|
||||
throw new IllegalArgumentException("Missing ']'");
|
||||
}
|
||||
|
||||
// Treat a trailing '$' as indicating ETHER. This code is only
|
||||
// executed if symbols == NULL; otherwise other code parses the
|
||||
// anchor.
|
||||
if (lastChar == SymbolTable.SYMBOL_REF) {
|
||||
rebuildPattern = true;
|
||||
newPat.append(lastChar);
|
||||
add(TransliterationRule.ETHER);
|
||||
}
|
||||
|
||||
else if (lastChar != NONE) {
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, false);
|
||||
}
|
||||
|
||||
// if (mode == 0) {
|
||||
// throw new IllegalArgumentException("Missing '[' in \"" +
|
||||
// pattern.substring(start) + '"');
|
||||
// }
|
||||
|
||||
// Handle unprocessed stuff preceding the closing ']'
|
||||
if (lastOp == '-') {
|
||||
// Trailing '-' is treated as literal
|
||||
|
@ -1543,7 +1511,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
throw new IllegalArgumentException("Unquoted trailing " + lastOp);
|
||||
}
|
||||
|
||||
newPat.append(']');
|
||||
if (mode == 4) {
|
||||
newPat.append(']');
|
||||
}
|
||||
|
||||
/**
|
||||
* If we saw a '^' after the initial '[' of this pattern, then perform
|
||||
|
@ -1553,21 +1523,6 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
complement();
|
||||
}
|
||||
|
||||
if (mode != 4) {
|
||||
throw new IllegalArgumentException("Missing ']'");
|
||||
}
|
||||
|
||||
// /**
|
||||
// * i indexes the last character we parsed or is pattern.length(). In
|
||||
// * the latter case, we have run off the end without finding a closing
|
||||
// * ']'. Otherwise, we know i < pattern.length(), and we set the
|
||||
// * ParsePosition to the next character to be parsed.
|
||||
// */
|
||||
// if (i == limit) {
|
||||
// throw new IllegalArgumentException("Missing ']' in \"" +
|
||||
// pattern.substring(start) + '"');
|
||||
// }
|
||||
|
||||
pos.setIndex(i);
|
||||
|
||||
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
|
||||
|
@ -1586,136 +1541,6 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Generation of Unicode categories
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets this object to the given category, given its name.
|
||||
* The category name must be either a two-letter name, such as
|
||||
* "Lu", or a one letter name, such as "L". One-letter names
|
||||
* indicate the logical union of all two-letter names that start
|
||||
* with that letter. Case is significant. If the name starts
|
||||
* with the character '^' then the complement of the given
|
||||
* character set is returned.
|
||||
*
|
||||
* Although individual categories such as "Lu" are cached, we do
|
||||
* not currently cache single-letter categories such as "L" or
|
||||
* complements such as "^Lu" or "^L". It would be easy to cache
|
||||
* these as well in a hashtable should the need arise.
|
||||
*
|
||||
* NEW: The category name can now be a script name, as defined
|
||||
* by UScript.
|
||||
*/
|
||||
private void applyCategory(String catName) {
|
||||
boolean invert = (catName.length() > 1 &&
|
||||
catName.charAt(0) == '^');
|
||||
if (invert) {
|
||||
catName = catName.substring(1);
|
||||
}
|
||||
|
||||
boolean match = false;
|
||||
|
||||
// BE CAREFUL not to modify the return value from
|
||||
// getCategorySet(int).
|
||||
|
||||
// if we have two characters, search the category map for that
|
||||
// code and either construct and return a UnicodeSet from the
|
||||
// data in the category map or throw an exception
|
||||
if (catName.length() == 2) {
|
||||
int i = CATEGORY_NAMES.indexOf(catName);
|
||||
if (i>=0 && i%2==0) {
|
||||
i /= 2;
|
||||
if (i != UNSUPPORTED_CATEGORY) {
|
||||
set(getCategorySet(i));
|
||||
match = true;
|
||||
}
|
||||
}
|
||||
} else if (catName.length() == 1) {
|
||||
// if we have one character, search the category map for
|
||||
// codes beginning with that letter, and union together
|
||||
// all of the matching sets that we find (or throw an
|
||||
// exception if there are no matches)
|
||||
clear();
|
||||
for (int i=0; i<CATEGORY_COUNT; ++i) {
|
||||
if (i != UNSUPPORTED_CATEGORY &&
|
||||
CATEGORY_NAMES.charAt(2*i) == catName.charAt(0)) {
|
||||
addAll(getCategorySet(i));
|
||||
match = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!match) {
|
||||
// TODO: Add caching of these, if desired
|
||||
int script = UScript.getCode(catName);
|
||||
if (script != UScript.INVALID_CODE) {
|
||||
match = true;
|
||||
clear();
|
||||
int start = -1;
|
||||
int end = -2;
|
||||
for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
|
||||
if (UScript.getScript(i) == script) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
add(start, end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
add(start, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!match) {
|
||||
throw new IllegalArgumentException("Illegal category [:" + catName + ":]");
|
||||
}
|
||||
|
||||
if (invert) {
|
||||
complement();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an inversion list for the given category. This list is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
private static UnicodeSet getCategorySet(int cat) {
|
||||
if (CATEGORY_CACHE[cat] == null) {
|
||||
// Walk through all Unicode characters, noting the start
|
||||
// and end of each range for which Character.getType(c)
|
||||
// returns the given category integer.
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
int start = -1;
|
||||
int end = -2;
|
||||
for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
|
||||
if (Character.getType((char)i) == cat) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
CATEGORY_CACHE[cat] = set;
|
||||
}
|
||||
return CATEGORY_CACHE[cat];
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
|
||||
* $Date: 2001/10/10 20:23:27 $
|
||||
* $Revision: 1.52 $
|
||||
* $Date: 2001/10/17 19:19:00 $
|
||||
* $Revision: 1.53 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -1318,7 +1318,7 @@ public class TransliteratorTest extends TestFmwk {
|
|||
public TestFact(String theID) {
|
||||
id = theID;
|
||||
}
|
||||
public Transliterator getInstance() {
|
||||
public Transliterator getInstance(String ignoredID) {
|
||||
return new NameableNullTrans(id);
|
||||
}
|
||||
};
|
||||
|
@ -1533,6 +1533,15 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test new property set syntax
|
||||
*/
|
||||
public void TestPropertySet() {
|
||||
expect("a>A; \\p{Lu}>x; \\p{ALL}>y;", "abcDEF", "Ayyxxx");
|
||||
expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9",
|
||||
"[ a stitch ]\n[ in time ]\r[ saves 9]");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4j ONLY
|
||||
// These tests are not mirrored (yet) in icu4c at
|
||||
|
@ -1551,6 +1560,10 @@ public class TransliteratorTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
|
||||
public void TestDebugIndic() {
|
||||
expect("'-'h\\u0323>a;", "-h\u0323", "a");
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// Ram's tests
|
||||
//======================================================================
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/UnicodeSetTest.java,v $
|
||||
* $Date: 2001/10/10 21:35:33 $
|
||||
* $Revision: 1.13 $
|
||||
* $Date: 2001/10/17 19:17:59 $
|
||||
* $Revision: 1.14 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -52,7 +52,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
// not used int TOP = 0x200; // Don't need to go over the whole range:
|
||||
set = new UnicodeSet("[:L:]");
|
||||
for (int i=0; i<0x200; ++i) {
|
||||
boolean l = Character.isLetter((char)i);
|
||||
boolean l = UCharacter.isLetter(i);
|
||||
if (l != set.contains((char)i)) {
|
||||
errln("FAIL: L contains " + (char)i + " = " +
|
||||
set.contains((char)i));
|
||||
|
@ -62,7 +62,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
|
||||
set = new UnicodeSet("[:Lu:]");
|
||||
for (int i=0; i<0x200; ++i) {
|
||||
boolean lu = (Character.getType((char)i) == Character.UPPERCASE_LETTER);
|
||||
boolean lu = (UCharacter.getType(i) == UCharacterCategory.UPPERCASE_LETTER);
|
||||
if (lu != set.contains((char)i)) {
|
||||
errln("FAIL: Lu contains " + (char)i + " = " +
|
||||
set.contains((char)i));
|
||||
|
@ -249,11 +249,13 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
/**
|
||||
* Test the [:Latin:] syntax.
|
||||
*/
|
||||
public void TestScriptSet() {
|
||||
public void TestPropertySet() {
|
||||
UnicodeSet set = new UnicodeSet("[:Latin:]");
|
||||
expectContainment(set, "aA", "\u0391\u03B1");
|
||||
set = new UnicodeSet("[:Greek:]");
|
||||
set = new UnicodeSet("[\\p{Greek}]");
|
||||
expectContainment(set, "\u0391\u03B1", "aA");
|
||||
set = new UnicodeSet("\\P{ GENERAL Category = upper case letter }");
|
||||
expectContainment(set, "abc", "ABC");
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -453,7 +455,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
if (bad.length() > 0) {
|
||||
logln(Utility.escape("Fail: set " + set + " does not contain " + bad +
|
||||
errln(Utility.escape("FAIL: set " + set + " does not contain " + bad +
|
||||
", expected containment of " + charsIn));
|
||||
} else {
|
||||
logln(Utility.escape("Ok: set " + set + " contains " + charsIn));
|
||||
|
@ -468,7 +470,7 @@ public class UnicodeSetTest extends TestFmwk {
|
|||
}
|
||||
}
|
||||
if (bad.length() > 0) {
|
||||
logln(Utility.escape("Fail: set " + set + " contains " + bad +
|
||||
errln(Utility.escape("FAIL: set " + set + " contains " + bad +
|
||||
", expected non-containment of " + charsOut));
|
||||
} else {
|
||||
logln(Utility.escape("Ok: set " + set + " does not contain " + charsOut));
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Quantifier.java,v $
|
||||
* $Date: 2001/10/04 18:24:15 $
|
||||
* $Revision: 1.1 $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.2 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -45,9 +45,15 @@ class Quantifier implements UnicodeMatcher {
|
|||
int start = offset[0];
|
||||
int count = 0;
|
||||
while (count < maxCount) {
|
||||
int pos = offset[0];
|
||||
int m = matcher.matches(text, offset, limit, incremental);
|
||||
if (m == U_MATCH) {
|
||||
++count;
|
||||
if (pos == offset[0]) {
|
||||
// If offset has not moved we have a zero-width match.
|
||||
// Don't keep matching it infinitely.
|
||||
break;
|
||||
}
|
||||
} else if (incremental && m == U_PARTIAL_MATCH) {
|
||||
return U_PARTIAL_MATCH;
|
||||
} else {
|
||||
|
|
|
@ -1,3 +1,13 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/TransliteratorParser.java,v $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.4 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.text;
|
||||
|
||||
import com.ibm.text.resources.ResourceReader;
|
||||
|
@ -85,6 +95,13 @@ class TransliteratorParser {
|
|||
*/
|
||||
private String undefinedVariableName;
|
||||
|
||||
/**
|
||||
* The stand-in character for the 'dot' set, represented by '.' in
|
||||
* patterns. This is allocated the first time it is needed, and
|
||||
* reused thereafter.
|
||||
*/
|
||||
private int dotStandIn = -1;
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Constants
|
||||
//----------------------------------------------------------------------
|
||||
|
@ -109,8 +126,6 @@ class TransliteratorParser {
|
|||
|
||||
private static final char CONTEXT_ANTE = '{'; // ante{key
|
||||
private static final char CONTEXT_POST = '}'; // key}post
|
||||
private static final char SET_OPEN = '[';
|
||||
private static final char SET_CLOSE = ']';
|
||||
private static final char CURSOR_POS = '|';
|
||||
private static final char CURSOR_OFFSET = '@';
|
||||
private static final char ANCHOR_START = '^';
|
||||
|
@ -119,6 +134,9 @@ class TransliteratorParser {
|
|||
private static final char ONE_OR_MORE = '+';
|
||||
private static final char ZERO_OR_ONE = '?';
|
||||
|
||||
private static final char DOT = '.';
|
||||
private static final String DOT_SET = "[^[:Zp:][:Zl:]\r\n$]";
|
||||
|
||||
// By definition, the ANCHOR_END special character is a
|
||||
// trailing SymbolTable.SYMBOL_REF character.
|
||||
// private static final char ANCHOR_END = '$';
|
||||
|
@ -541,6 +559,15 @@ class TransliteratorParser {
|
|||
// Text after a presumed end anchor is a syntax err
|
||||
syntaxError("Malformed variable reference", rule, start);
|
||||
}
|
||||
if (UnicodeSet.resemblesPattern(rule, pos-1)) {
|
||||
if (pp == null) {
|
||||
pp = new ParsePosition(0);
|
||||
}
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
pos = pp.getIndex();
|
||||
continue;
|
||||
}
|
||||
// Handle escapes
|
||||
if (c == ESCAPE) {
|
||||
if (pos == limit) {
|
||||
|
@ -682,14 +709,6 @@ class TransliteratorParser {
|
|||
}
|
||||
post = buf.length();
|
||||
break;
|
||||
case SET_OPEN:
|
||||
if (pp == null) {
|
||||
pp = new ParsePosition(0);
|
||||
}
|
||||
pp.setIndex(pos-1); // Backup to opening '['
|
||||
buf.append(parser.parseSet(rule, pp));
|
||||
pos = pp.getIndex();
|
||||
break;
|
||||
case CURSOR_POS:
|
||||
if (cursor >= 0) {
|
||||
syntaxError("Multiple cursors", rule, start);
|
||||
|
@ -718,6 +737,9 @@ class TransliteratorParser {
|
|||
}
|
||||
}
|
||||
break;
|
||||
case DOT:
|
||||
buf.append(parser.getDotStandIn());
|
||||
break;
|
||||
case KLEENE_STAR:
|
||||
case ONE_OR_MORE:
|
||||
case ZERO_OR_ONE:
|
||||
|
@ -783,7 +805,6 @@ class TransliteratorParser {
|
|||
buf.append(parser.generateStandInFor(m));
|
||||
}
|
||||
break;
|
||||
// case SET_CLOSE:
|
||||
default:
|
||||
// Disallow unquoted characters other than [0-9A-Za-z]
|
||||
// in the printable ASCII range. These characters are
|
||||
|
@ -1357,6 +1378,17 @@ class TransliteratorParser {
|
|||
return variableNext++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the stand-in for the dot set. It is allocated the first
|
||||
* time and reused thereafter.
|
||||
*/
|
||||
char getDotStandIn() {
|
||||
if (dotStandIn == -1) {
|
||||
dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET));
|
||||
}
|
||||
return (char) dotStandIn;
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the value of the given variable name to the given
|
||||
* StringBuffer.
|
||||
|
|
590
icu4j/src/com/ibm/text/UnicodePropertySet.java
Executable file
590
icu4j/src/com/ibm/text/UnicodePropertySet.java
Executable file
|
@ -0,0 +1,590 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodePropertySet.java,v $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.1 $
|
||||
**********************************************************************
|
||||
*/
|
||||
package com.ibm.text;
|
||||
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
import com.ibm.util.Utility;
|
||||
|
||||
/**
|
||||
* INTERNAL CLASS implementing the UnicodeSet properties as outlined
|
||||
* at:
|
||||
*
|
||||
* http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
|
||||
*
|
||||
* Recognized syntax:
|
||||
*
|
||||
* [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
|
||||
* \p{foo} \P{foo} - white space not allowed within "\p" or "\P"
|
||||
*
|
||||
* Other than the above restrictions, white space is ignored. Case
|
||||
* is ignored except in "\p" and "\P".
|
||||
*
|
||||
* This class cannot be instantiated. It has a public static method,
|
||||
* createFromPattern(), which takes a pattern to be parsed and returns
|
||||
* a new UnicodeSet.  Another public static method,
|
||||
* resemblesPattern(), returns true if a given pattern string appears
|
||||
* to be a property set pattern, and therefore should be passed in to
|
||||
* createFromPattern().
|
||||
*
|
||||
* NOTE: Current implementation is incomplete. The following list
|
||||
* indicates which properties are supported.
|
||||
*
|
||||
* + GeneralCategory
|
||||
* CombiningClass
|
||||
* BidiClass
|
||||
* DecompositionType
|
||||
* + NumericValue
|
||||
* NumericType
|
||||
* EastAsianWidth
|
||||
* LineBreak
|
||||
* JoiningType
|
||||
* + Script
|
||||
*
|
||||
* '+' indicates a supported property.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodePropertySet.java,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:17:06 $
|
||||
*/
|
||||
class UnicodePropertySet {
|
||||
|
||||
/**
 * Maps munged property type names (both short and long forms, e.g.
 * "GC" and "GENERALCATEGORY") to the SetFactory that builds sets for
 * that property.  Populated by addType() in the static initializer.
 */
private static final Hashtable NAME_MAP = new Hashtable();

/**
 * Maps munged general category value names (both short and long
 * forms, e.g. "LU" and "UPPERCASELETTER") to an Integer.  The Integer
 * is either a bit mask with bit (1 << category) set for each included
 * UCharacterCategory code, or the special value ANY.  Populated by
 * addValue() in the static initializer.
 */
private static final Hashtable CATEGORY_MAP = new Hashtable();

/**
 * A cache mapping character category integers, as returned by
 * UCharacter.getType(), to sets.  Entries are initially
 * null and are created on demand.
 */
private static final UnicodeSet[] CATEGORY_CACHE =
    new UnicodeSet[UCharacterCategory.CHAR_CATEGORY_COUNT];

/**
 * A cache mapping script integers, as defined by
 * UScript, to sets.  Entries are initially
 * null and are created on demand.
 */
private static final UnicodeSet[] SCRIPT_CACHE =
    new UnicodeSet[UScript.CODE_LIMIT];

// Special value codes
private static final int ANY = -1; // general category: all code points
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a property set pattern [:foo:], \p{foo}, or
|
||||
* \P{foo}.
|
||||
*/
|
||||
public static boolean resemblesPattern(String pattern, int pos) {
|
||||
// Patterns are at least 5 characters long
|
||||
if ((pos+5) > pattern.length()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
return pattern.regionMatches(pos, "[:", 0, 2) ||
|
||||
pattern.regionMatches(true, pos, "\\p", 0, 2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a UnicodeSet by parsing the given pattern at the given
|
||||
* parse position.
|
||||
*
|
||||
* @param pattern the pattern string
|
||||
* @param ppos on entry, the position at which to begin parsing.
|
||||
* This should be one of the locations marked '^':
|
||||
*
|
||||
*   [:blah:]     \p{blah}     \P{blah}
|
||||
*   ^       %    ^       %    ^       %
|
||||
*
|
||||
* On return, the position after the last character parsed, that is,
|
||||
* the locations marked '%'. If the parse fails, ppos is returned
|
||||
* unchanged.
|
||||
* @return a newly-constructed UnicodeSet object, or null upon
|
||||
* failure.
|
||||
*/
|
||||
public static UnicodeSet createFromPattern(String pattern, ParsePosition ppos) {
|
||||
|
||||
UnicodeSet set = null;
|
||||
|
||||
int pos = ppos.getIndex();
|
||||
|
||||
// On entry, ppos should point to one of the following locations:
|
||||
|
||||
// Minimum length is 5 characters, e.g. \p{L}
|
||||
if ((pos+5) > pattern.length()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
boolean posix = false; // true for [:pat:], false for \p{pat} \P{pat}
|
||||
boolean invert = false;
|
||||
|
||||
// Look for an opening [:, [:^, \p, or \P
|
||||
if (pattern.regionMatches(pos, "[:", 0, 2)) {
|
||||
posix = true;
|
||||
pos = skipWhitespace(pattern, pos+2);
|
||||
if (pos < pattern.length() && pattern.charAt(pos) == '^') {
|
||||
++pos;
|
||||
invert = true;
|
||||
}
|
||||
} else if (pattern.regionMatches(true, pos, "\\p", 0, 2)) {
|
||||
invert = (pattern.charAt(pos+1) == 'P');
|
||||
pos = skipWhitespace(pattern, pos+2);
|
||||
if (pos == pattern.length() || pattern.charAt(pos++) != '{') {
|
||||
// Syntax error; "\p" or "\P" not followed by "{"
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
// Open delimiter not seen
|
||||
return null;
|
||||
}
|
||||
|
||||
// Look for the matching close delimiter, either :] or }
|
||||
int close = pattern.indexOf(posix ? ":]" : "}", pos);
|
||||
if (close < 0) {
|
||||
// Syntax error; close delimiter missing
|
||||
return null;
|
||||
}
|
||||
|
||||
// Look for an '=' sign. If this is present, we will parse a
|
||||
// medium \p{gc=Cf} or long \p{GeneralCategory=Format}
|
||||
// pattern.
|
||||
int equals = pattern.indexOf('=', pos);
|
||||
if (equals >= 0 && equals < close) {
|
||||
// Equals seen; parse medium/long pattern
|
||||
String typeName = munge(pattern, pos, equals);
|
||||
String valueName = munge(pattern, equals+1, close);
|
||||
SetFactory factory;
|
||||
factory = (SetFactory) NAME_MAP.get(typeName);
|
||||
if (factory == null) {
|
||||
// Syntax error; type name not recognized
|
||||
return null;
|
||||
}
|
||||
set = factory.create(valueName);
|
||||
} else {
|
||||
// No equals seen; parse short format \p{Cf}
|
||||
String shortName = munge(pattern, pos, close);
|
||||
|
||||
// First try general category
|
||||
set = createCategorySet(shortName);
|
||||
|
||||
// If this fails, try script
|
||||
if (set == null) {
|
||||
set = createScriptSet(shortName);
|
||||
}
|
||||
}
|
||||
|
||||
if (invert) {
|
||||
set.complement();
|
||||
}
|
||||
|
||||
// Move to the limit position after the close delimiter
|
||||
ppos.setIndex(close + (posix ? 2 : 1));
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Property set factory classes
|
||||
// NOTE: This will change/go away when we implement UCharacter
|
||||
// based property retrieval.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
 * Factory that builds the set for one property type (e.g. general
 * category, script, numeric value).  Instances are registered in
 * NAME_MAP under the property's short and long type names.
 */
static interface SetFactory {

    /**
     * Create the set of characters whose property has the given value.
     * @param valueName the value name; createFromPattern() passes it
     * already munged (upper case, white space removed)
     * @return a set the caller may modify.  NOTE(review): some
     * implementations (those delegating to createCategorySet() and
     * createScriptSet()) return null for an unrecognized name --
     * callers should be prepared for null.
     */
    UnicodeSet create(String valueName);
}
|
||||
|
||||
static class NumericValueFactory implements SetFactory {
|
||||
NumericValueFactory() {}
|
||||
public UnicodeSet create(String valueName) {
|
||||
double value = Double.parseDouble(valueName);
|
||||
final int ivalue = (int) value;
|
||||
if (ivalue != value || ivalue < 0) {
|
||||
// UCharacter doesn't support negative or non-integral
|
||||
// values, so just return an empty set
|
||||
return new UnicodeSet();
|
||||
}
|
||||
return createSetFromFilter(new Filter() {
|
||||
public boolean contains(int cp) {
|
||||
return UCharacter.getUnicodeNumericValue(cp) == ivalue;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Property set factory static methods
|
||||
// NOTE: This will change/go away when we implement UCharacter
|
||||
// based property retrieval.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Given a general category value name, create a corresponding
|
||||
* set and return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged general category value name
|
||||
*/
|
||||
private static UnicodeSet createCategorySet(String valueName) {
|
||||
Integer valueObj;
|
||||
valueObj = (Integer) CATEGORY_MAP.get(valueName);
|
||||
if (valueObj == null) {
|
||||
return null;
|
||||
}
|
||||
int valueCode = valueObj.intValue();
|
||||
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
if (valueCode == ANY) {
|
||||
set.complement();
|
||||
return set;
|
||||
}
|
||||
for (int cat=0; cat<UCharacterCategory.CHAR_CATEGORY_COUNT; ++cat) {
|
||||
if ((valueCode & (1 << cat)) != 0) {
|
||||
set.addAll(UnicodePropertySet.getCategorySet(cat));
|
||||
}
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a script value name, create a corresponding set and
|
||||
* return it, or return null if the name is invalid.
|
||||
* @param valueName a pre-munged script value name
|
||||
*/
|
||||
private static UnicodeSet createScriptSet(String valueName) {
|
||||
int script = UScript.getCode(valueName);
|
||||
if (script == UScript.INVALID_CODE) {
|
||||
// Syntax error; unknown short name
|
||||
return null;
|
||||
}
|
||||
return new UnicodeSet(getScriptSet(script));
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given category. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
private static UnicodeSet getCategorySet(final int cat) {
|
||||
if (CATEGORY_CACHE[cat] == null) {
|
||||
CATEGORY_CACHE[cat] =
|
||||
createSetFromFilter(new Filter() {
|
||||
public boolean contains(int cp) {
|
||||
return UCharacter.getType(cp) == cat;
|
||||
}
|
||||
});
|
||||
}
|
||||
return CATEGORY_CACHE[cat];
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a UnicodeSet for the given script. This set is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
private static UnicodeSet getScriptSet(final int script) {
|
||||
if (SCRIPT_CACHE[script] == null) {
|
||||
SCRIPT_CACHE[script] =
|
||||
createSetFromFilter(new Filter() {
|
||||
public boolean contains(int cp) {
|
||||
return UScript.getScript(cp) == script;
|
||||
}
|
||||
});
|
||||
}
|
||||
return SCRIPT_CACHE[script];
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a string, munge it to upper case and lose the whitespace.
|
||||
* So "General Category " becomes "GENERALCATEGORY". We munge all
|
||||
* type and value strings, and store all type and value keys
|
||||
* pre-munged.
|
||||
*/
|
||||
private static String munge(String str, int start, int limit) {
|
||||
StringBuffer buf = new StringBuffer();
|
||||
for (int i=start; i<limit; ) {
|
||||
int c = UTF16.charAt(str, i);
|
||||
i += UTF16.getCharCount(c);
|
||||
if (!UCharacter.isWhitespace(c)) {
|
||||
UTF16.append(buf, UCharacter.toUpperCase(c));
|
||||
}
|
||||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Skip over a sequence of zero or more white space characters
|
||||
* at pos. Return the index of the first non-white-space character
|
||||
* at or after pos, or str.length(), if there is none.
|
||||
*/
|
||||
private static int skipWhitespace(String str, int pos) {
|
||||
while (pos < str.length()) {
|
||||
int c = UTF16.charAt(str, pos);
|
||||
if (!UCharacter.isWhitespace(c)) {
|
||||
break;
|
||||
}
|
||||
pos += UTF16.getCharCount(c);
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Generic filter-based scanning code
|
||||
//
|
||||
// NOTE: In general, we don't want to do this! This is a temporary
|
||||
// implementation until we have time for something that examines
|
||||
// the underlying UCharacter data structures in an intelligent
|
||||
// way. Iterating over all code points is dumb. What we want to
|
||||
// do, for instance, is iterate over internally-stored ranges
|
||||
// of characters that have a given property.
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
 * A predicate over code points, used by createSetFromFilter() to
 * decide which code points belong in the set being built.
 */
static interface Filter {
    /**
     * @param codePoint the code point to test
     * @return true if the code point should be included in the set
     */
    boolean contains(int codePoint);
}
|
||||
|
||||
static UnicodeSet createSetFromFilter(Filter filter) {
|
||||
// Walk through all Unicode characters, noting the start
|
||||
// and end of each range for which filter.contain(c) is
|
||||
// true. Add each range to a set.
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
int start = -1;
|
||||
int end = -2;
|
||||
|
||||
// TODO Extend this up to UnicodeSet.MAX_VALUE when we have
|
||||
// better performance; i.e., when this code can get moved into
|
||||
// the UCharacter class and not have to iterate over code
|
||||
// points. Right now it's way too slow to iterate to 10FFFF.
|
||||
|
||||
for (int i=UnicodeSet.MIN_VALUE; i<=0xFFFF; ++i) {
|
||||
if (filter.contains(i)) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
return set;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Type and value name maps
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Add a type mapping to the name map.
|
||||
*/
|
||||
private static void addType(String shortName, String longName,
|
||||
SetFactory factory) {
|
||||
// DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
|
||||
if (true) {
|
||||
if (NAME_MAP.get(shortName) != null) {
|
||||
throw new InternalError("Duplicate name " + shortName);
|
||||
}
|
||||
if (NAME_MAP.get(longName) != null) {
|
||||
throw new InternalError("Duplicate name " + longName);
|
||||
}
|
||||
}
|
||||
|
||||
NAME_MAP.put(shortName, factory);
|
||||
NAME_MAP.put(longName, factory);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a value mapping to the name map.
|
||||
*/
|
||||
private static void addValue(Hashtable map,
|
||||
String shortName, String longName,
|
||||
int value) {
|
||||
// DEBUGGING CODE: DISABLE FOR PRODUCTION BUILD
|
||||
if (true) {
|
||||
if (map.get(shortName) != null) {
|
||||
throw new InternalError("Duplicate name " + shortName);
|
||||
}
|
||||
if (longName != null && map.get(longName) != null) {
|
||||
throw new InternalError("Duplicate name " + longName);
|
||||
}
|
||||
}
|
||||
|
||||
Integer valueObj = new Integer(value);
|
||||
map.put(shortName, valueObj);
|
||||
if (longName != null) {
|
||||
map.put(longName, valueObj);
|
||||
}
|
||||
}
|
||||
|
||||
// Static initializer: registers the supported property types in
// NAME_MAP and the general category value names in CATEGORY_MAP.
static {
    // NOTE: We munge all search keys to have no whitespace
    // and upper case.  As such, all stored keys should have
    // this format.

    // Load the map with type data

    addType("GC", "GENERALCATEGORY", new SetFactory() {
        public UnicodeSet create(String valueName) {
            return createCategorySet(valueName);
        }
    });

    // Not yet supported:
    //addType("CC", "COMBININGCLASS", COMBINING_CLASS);
    //addType("BC", "BIDICLASS", BIDI_CLASS);
    //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);

    addType("NV", "NUMERICVALUE", new NumericValueFactory());

    // Not yet supported:
    //addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
    //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
    //addType("LB", "LINEBREAK", LINE_BREAK);
    //addType("JT", "JOININGTYPE", JOINING_TYPE);

    addType("SC", "SCRIPT", new SetFactory() {
        public UnicodeSet create(String valueName) {
            return createScriptSet(valueName);
        }
    });

    // Load the map with value data

    // General Category
    //
    // Each value is a bit mask with bit (1 << category) set for every
    // UCharacterCategory code it covers.  One-letter names are the
    // union of the two-letter categories that follow them.

    addValue(CATEGORY_MAP, "ANY", null, ANY); // special case

    addValue(CATEGORY_MAP, "C", "OTHER",
             (1 << UCharacterCategory.CONTROL) |
             (1 << UCharacterCategory.FORMAT) |
             (1 << UCharacterCategory.GENERAL_OTHER_TYPES) |
             (1 << UCharacterCategory.PRIVATE_USE) |
             (1 << UCharacterCategory.SURROGATE));

    addValue(CATEGORY_MAP, "CC", "CONTROL",
             1 << UCharacterCategory.CONTROL);
    addValue(CATEGORY_MAP, "CF", "FORMAT",
             1 << UCharacterCategory.FORMAT);
    addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
             1 << UCharacterCategory.GENERAL_OTHER_TYPES);
    addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
             1 << UCharacterCategory.PRIVATE_USE);
    addValue(CATEGORY_MAP, "CS", "SURROGATE",
             1 << UCharacterCategory.SURROGATE);

    addValue(CATEGORY_MAP, "L", "LETTER",
             (1 << UCharacterCategory.LOWERCASE_LETTER) |
             (1 << UCharacterCategory.MODIFIER_LETTER) |
             (1 << UCharacterCategory.OTHER_LETTER) |
             (1 << UCharacterCategory.TITLECASE_LETTER) |
             (1 << UCharacterCategory.UPPERCASE_LETTER));

    addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
             1 << UCharacterCategory.LOWERCASE_LETTER);
    addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
             1 << UCharacterCategory.MODIFIER_LETTER);
    addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
             1 << UCharacterCategory.OTHER_LETTER);
    addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
             1 << UCharacterCategory.TITLECASE_LETTER);
    addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
             1 << UCharacterCategory.UPPERCASE_LETTER);

    addValue(CATEGORY_MAP, "M", "MARK",
             (1 << UCharacterCategory.NON_SPACING_MARK) |
             (1 << UCharacterCategory.COMBINING_SPACING_MARK) |
             (1 << UCharacterCategory.ENCLOSING_MARK));

    addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
             1 << UCharacterCategory.NON_SPACING_MARK);
    addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
             1 << UCharacterCategory.COMBINING_SPACING_MARK);
    addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
             1 << UCharacterCategory.ENCLOSING_MARK);

    addValue(CATEGORY_MAP, "N", "NUMBER",
             (1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER) |
             (1 << UCharacterCategory.LETTER_NUMBER) |
             (1 << UCharacterCategory.OTHER_NUMBER));

    addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
             1 << UCharacterCategory.DECIMAL_DIGIT_NUMBER);
    addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
             1 << UCharacterCategory.LETTER_NUMBER);
    addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
             1 << UCharacterCategory.OTHER_NUMBER);

    addValue(CATEGORY_MAP, "P", "PUNCTUATION",
             (1 << UCharacterCategory.CONNECTOR_PUNCTUATION) |
             (1 << UCharacterCategory.DASH_PUNCTUATION) |
             (1 << UCharacterCategory.END_PUNCTUATION) |
             (1 << UCharacterCategory.FINAL_PUNCTUATION) |
             (1 << UCharacterCategory.INITIAL_PUNCTUATION) |
             (1 << UCharacterCategory.OTHER_PUNCTUATION) |
             (1 << UCharacterCategory.START_PUNCTUATION));

    addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
             1 << UCharacterCategory.CONNECTOR_PUNCTUATION);
    addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
             1 << UCharacterCategory.DASH_PUNCTUATION);
    addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
             1 << UCharacterCategory.END_PUNCTUATION);
    addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
             1 << UCharacterCategory.FINAL_PUNCTUATION);
    addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
             1 << UCharacterCategory.INITIAL_PUNCTUATION);
    addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
             1 << UCharacterCategory.OTHER_PUNCTUATION);
    addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
             1 << UCharacterCategory.START_PUNCTUATION);

    addValue(CATEGORY_MAP, "S", "SYMBOL",
             (1 << UCharacterCategory.CURRENCY_SYMBOL) |
             (1 << UCharacterCategory.MODIFIER_SYMBOL) |
             (1 << UCharacterCategory.MATH_SYMBOL) |
             (1 << UCharacterCategory.OTHER_SYMBOL));

    addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
             1 << UCharacterCategory.CURRENCY_SYMBOL);
    addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
             1 << UCharacterCategory.MODIFIER_SYMBOL);
    addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
             1 << UCharacterCategory.MATH_SYMBOL);
    addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
             1 << UCharacterCategory.OTHER_SYMBOL);

    addValue(CATEGORY_MAP, "Z", "SEPARATOR",
             (1 << UCharacterCategory.LINE_SEPARATOR) |
             (1 << UCharacterCategory.PARAGRAPH_SEPARATOR) |
             (1 << UCharacterCategory.SPACE_SEPARATOR));

    addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
             1 << UCharacterCategory.LINE_SEPARATOR);
    addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
             1 << UCharacterCategory.PARAGRAPH_SEPARATOR);
    addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
             1 << UCharacterCategory.SPACE_SEPARATOR);
}
|
||||
}
|
|
@ -5,8 +5,8 @@
|
|||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
|
||||
* $Date: 2001/10/10 21:35:05 $
|
||||
* $Revision: 1.39 $
|
||||
* $Date: 2001/10/17 19:17:06 $
|
||||
* $Revision: 1.40 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
|
@ -202,60 +202,26 @@ import com.ibm.util.Utility;
|
|||
* starting with 'L', that is, <code>[[:Lu:][:Ll:][:Lt:][:Lm:][:Lo:]]</code>.
|
||||
* </table>
|
||||
*
|
||||
* <p><b>Character categories.</b>
|
||||
* <p><b>Character properties.</b>
|
||||
*
|
||||
* Character categories are specified using the POSIX-like syntax
|
||||
* '[:Lu:]'. The complement of a category is specified by inserting
|
||||
* '^' after the opening '[:'. The following category names are
|
||||
* recognized. Actual determination of category data uses
|
||||
* <code>Character.getType()</code>, so it reflects the underlying
|
||||
* implmementation used by <code>Character</code>. As of Java 2 and
|
||||
* JDK 1.1.8, this is Unicode 2.1.2.
|
||||
* <p>Character properties are specified using the POSIX-like syntax
|
||||
* "[:Lu:]" or the Perl-like syntax "\p{Lu}". The complement of a
|
||||
* category is specified as "[:^Lu:]" or "\P{Lu}". Actual
|
||||
* determination of category data is accomplished by UCharacter using
|
||||
* the underlying Unicode database.
|
||||
*
|
||||
* <pre>
|
||||
* Normative
|
||||
* Mn = Mark, Non-Spacing
|
||||
* Mc = Mark, Spacing Combining
|
||||
* Me = Mark, Enclosing
|
||||
* <p>For details of the property syntax please see this
|
||||
* <a href="http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html">
|
||||
* draft document</a>.
|
||||
*
|
||||
* Nd = Number, Decimal Digit
|
||||
* Nl = Number, Letter
|
||||
* No = Number, Other
|
||||
*
|
||||
* Zs = Separator, Space
|
||||
* Zl = Separator, Line
|
||||
* Zp = Separator, Paragraph
|
||||
*
|
||||
* Cc = Other, Control
|
||||
* Cf = Other, Format
|
||||
* Cs = Other, Surrogate
|
||||
* Co = Other, Private Use
|
||||
* Cn = Other, Not Assigned
|
||||
*
|
||||
* Informative
|
||||
* Lu = Letter, Uppercase
|
||||
* Ll = Letter, Lowercase
|
||||
* Lt = Letter, Titlecase
|
||||
* Lm = Letter, Modifier
|
||||
* Lo = Letter, Other
|
||||
*
|
||||
* Pc = Punctuation, Connector
|
||||
* Pd = Punctuation, Dash
|
||||
* Ps = Punctuation, Open
|
||||
* Pe = Punctuation, Close
|
||||
* *Pi = Punctuation, Initial quote
|
||||
* *Pf = Punctuation, Final quote
|
||||
* Po = Punctuation, Other
|
||||
*
|
||||
* Sm = Symbol, Math
|
||||
* Sc = Symbol, Currency
|
||||
* Sk = Symbol, Modifier
|
||||
* So = Symbol, Other
|
||||
* </pre>
|
||||
* *Unsupported by Java (and hence unsupported by UnicodeSet).
|
||||
* <p><em>Note:</em> Not all properties are currently supported.
|
||||
* Currently, only the general category, script, and numeric value
|
||||
* properties are supported. Support for other properties will be
|
||||
* added in the future.
|
||||
*
|
||||
* @author Alan Liu
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.39 $ $Date: 2001/10/10 21:35:05 $ */
|
||||
* @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.40 $ $Date: 2001/10/17 19:17:06 $
|
||||
*/
|
||||
public class UnicodeSet extends UnicodeFilter {
|
||||
|
||||
/* Implementation Notes.
|
||||
|
@ -313,29 +279,12 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
* modified using the non-pattern API, this string will be null,
|
||||
* indicating that toPattern() must generate a pattern
|
||||
* representation from the inversion list.
|
||||
*/
|
||||
*/
|
||||
private String pat = null;
|
||||
|
||||
private static final int START_EXTRA = 16; // initial storage. Must be >= 0
|
||||
private static final int GROW_EXTRA = START_EXTRA; // extra amount for growth. Must be >= 0
|
||||
|
||||
private static final String CATEGORY_NAMES =
|
||||
// 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
|
||||
//0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8
|
||||
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
|
||||
|
||||
private static final int UNSUPPORTED_CATEGORY = 17;
|
||||
|
||||
private static final int CATEGORY_COUNT = 29;
|
||||
|
||||
/**
|
||||
* A cache mapping character category integers, as returned by
|
||||
* Character.getType(), to inversion lists. Entries are initially
|
||||
* null and are created on demand.
|
||||
*/
|
||||
private static final UnicodeSet[] CATEGORY_CACHE =
|
||||
new UnicodeSet[CATEGORY_COUNT];
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Public API
|
||||
//----------------------------------------------------------------
|
||||
|
@ -408,19 +357,27 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
applyPattern(pattern, pos, symbols, true);
|
||||
}
|
||||
|
||||
private static final String CATEGORY_NAMES =
|
||||
// 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2
|
||||
//0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 8 9 0 1 2 3 4 5 6 7 8
|
||||
"CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCf--CoCsPdPsPePcPoSmScSkSo";
|
||||
/**
|
||||
* Constructs a set from the given Unicode character category.
|
||||
* DEPRECATED - Constructs a set from the given Unicode character
|
||||
* category.
|
||||
* @param category an integer indicating the character category as
|
||||
* returned by <code>Character.getType()</code>.
|
||||
* returned by <code>java.lang.Character.getType()</code>. Note
|
||||
* that this is <em>different</em> from the UCharacterCategory
|
||||
* codes.
|
||||
* @exception java.lang.IllegalArgumentException if the given
|
||||
* category is invalid.
|
||||
* @deprecated this will be removed Dec-31-2001
|
||||
*/
|
||||
public UnicodeSet(int category) {
|
||||
if (category < 0 || category >= CATEGORY_COUNT ||
|
||||
category == UNSUPPORTED_CATEGORY) {
|
||||
if (category < 0 || category > java.lang.Character.OTHER_SYMBOL ||
|
||||
category == 17) {
|
||||
throw new IllegalArgumentException("Invalid category");
|
||||
}
|
||||
set(getCategorySet(category));
|
||||
applyPattern(CATEGORY_NAMES.substring(2*category, 2*category+2), false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -489,6 +446,16 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true if the given position, in the given pattern, appears
|
||||
* to be the start of a UnicodeSet pattern.
|
||||
*/
|
||||
public static boolean resemblesPattern(String pattern, int pos) {
|
||||
return ((pos+1) < pattern.length() &&
|
||||
pattern.charAt(pos) == '[') ||
|
||||
UnicodePropertySet.resemblesPattern(pattern, pos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the <code>toPattern()</code> representation of a
|
||||
* character to the given <code>StringBuffer</code>.
|
||||
|
@ -509,6 +476,8 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
case '^': // COMPLEMENT:
|
||||
case '&': // INTERSECTION:
|
||||
case '\\': //BACKSLASH:
|
||||
case '{':
|
||||
case '}':
|
||||
buf.append('\\');
|
||||
break;
|
||||
default:
|
||||
|
@ -607,28 +576,28 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
return _generatePattern(result, escapeUnprintable);
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate and append a string representation of this set to result.
|
||||
* This does not use this.pat, the cleaned up copy of the string
|
||||
* passed to applyPattern().
|
||||
* passed to applyPattern().
|
||||
*/
|
||||
public StringBuffer _generatePattern(StringBuffer result,
|
||||
boolean escapeUnprintable) {
|
||||
result.append('[');
|
||||
|
||||
// Check against the predefined categories. We implicitly build
|
||||
// up ALL category sets the first time toPattern() is called.
|
||||
for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
|
||||
if (this.equals(getCategorySet(cat))) {
|
||||
result.append(':');
|
||||
result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
|
||||
return result.append(":]");
|
||||
}
|
||||
}
|
||||
// // Check against the predefined categories. We implicitly build
|
||||
// // up ALL category sets the first time toPattern() is called.
|
||||
// for (int cat=0; cat<CATEGORY_COUNT; ++cat) {
|
||||
// if (this.equals(getCategorySet(cat))) {
|
||||
// result.append(':');
|
||||
// result.append(CATEGORY_NAMES.substring(cat*2, cat*2+2));
|
||||
// return result.append(":]");
|
||||
// }
|
||||
// }
|
||||
|
||||
int count = getRangeCount();
|
||||
|
||||
|
@ -1205,7 +1174,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
StringBuffer newPat = new StringBuffer("[");
|
||||
int nestedPatStart = -1; // see below for usage
|
||||
boolean nestedPatDone = false; // see below for usage
|
||||
|
||||
|
||||
boolean invert = false;
|
||||
clear();
|
||||
|
||||
|
@ -1231,8 +1200,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// mode 1: '[' seen; if next is '^' or ':' then special
|
||||
// mode 2: '[' '^'? seen; parse pattern and close with ']'
|
||||
// mode 3: '[:' seen; parse category and close with ':]'
|
||||
// mode 4: ']' seen; parse complete
|
||||
// mode 5: Top-level property pattern seen
|
||||
int mode = 0;
|
||||
int colonPos = 0; // Expected pos of ':' in '[:'
|
||||
int start = pos.getIndex();
|
||||
int i = start;
|
||||
int limit = pattern.length();
|
||||
|
@ -1285,9 +1255,11 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// Parse the opening '[' and optional following '^'
|
||||
switch (mode) {
|
||||
case 0:
|
||||
if (c == '[') {
|
||||
if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
|
||||
mode = 3;
|
||||
break; // Fall through
|
||||
} else if (c == '[') {
|
||||
mode = 1; // Next look for '^'
|
||||
colonPos = i; // Expect ':' at next offset
|
||||
continue;
|
||||
} else {
|
||||
throw new IllegalArgumentException("Missing opening '['");
|
||||
|
@ -1299,17 +1271,6 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
invert = true;
|
||||
newPat.append((char) c);
|
||||
continue; // Back to top to fetch next character
|
||||
case ':':
|
||||
if (i-1 == colonPos) {
|
||||
// '[:' cannot have whitespace in it
|
||||
--i; // Backup to the '['
|
||||
c = '[';
|
||||
mode = 3;
|
||||
// Fall through and parse category using the same
|
||||
// code used to parse a nested category. The mode
|
||||
// will indicate that this is actually top level.
|
||||
}
|
||||
break; // Fall through
|
||||
case '-':
|
||||
isLiteral = true; // Treat leading '-' as a literal
|
||||
break; // Fall through
|
||||
|
@ -1326,12 +1287,47 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// buffer. Characters in the variable buffer have already
|
||||
// been through escape and variable reference processing.
|
||||
if (varValueBuffer == null) {
|
||||
/**
|
||||
* Handle property set patterns.
|
||||
*/
|
||||
if (UnicodePropertySet.resemblesPattern(pattern, i-1)) {
|
||||
ParsePosition pp = new ParsePosition(i-1);
|
||||
nestedSet = UnicodePropertySet.createFromPattern(pattern, pp);
|
||||
if (nestedSet == null) {
|
||||
// assert(pp.getIndex() == i-1);
|
||||
throw new IllegalArgumentException("Invalid property pattern " +
|
||||
pattern.substring(i-1));
|
||||
}
|
||||
nestedPatStart = newPat.length();
|
||||
nestedPatDone = true; // we're going to do it just below
|
||||
|
||||
// If we have a top-level property pattern, then trim
|
||||
// off the opening '[' and use the property pattern
|
||||
// as the entire pattern.
|
||||
if (mode == 3) {
|
||||
newPat.deleteCharAt(0);
|
||||
}
|
||||
newPat.append(pattern.substring(i-1, pp.getIndex()));
|
||||
rebuildPattern = true;
|
||||
|
||||
i = pp.getIndex(); // advance past property pattern
|
||||
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse
|
||||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
set(nestedSet);
|
||||
mode = 5;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Handle escapes. If a character is escaped, then it assumes its
|
||||
* literal value. This is true for all characters, both special
|
||||
* characters and characters with no special meaning. We also
|
||||
* interpret '\\uxxxx' Unicode escapes here (as literals).
|
||||
*/
|
||||
if (c == '\\') {
|
||||
else if (c == '\\') {
|
||||
int[] offset = new int[] { i };
|
||||
int escaped = Utility.unescapeAt(pattern, offset);
|
||||
if (escaped == -1) {
|
||||
|
@ -1373,61 +1369,25 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
|
||||
/* An opening bracket indicates the first bracket of a nested
|
||||
* subpattern, either a normal pattern or a category pattern. We
|
||||
* recognize these here and set nestedSet accordingly.
|
||||
* subpattern.
|
||||
*/
|
||||
else if (!isLiteral && c == '[') {
|
||||
// Record position before nested pattern
|
||||
nestedPatStart = newPat.length();
|
||||
|
||||
// Handle "[:...:]", representing a character category
|
||||
if (i < pattern.length() && pattern.charAt(i) == ':') {
|
||||
++i;
|
||||
int j = pattern.indexOf(":]", i);
|
||||
if (j < 0) {
|
||||
throw new IllegalArgumentException("Missing \":]\"");
|
||||
}
|
||||
String scratch = pattern.substring(i, j);
|
||||
nestedSet = new UnicodeSet();
|
||||
nestedSet.applyCategory(scratch);
|
||||
nestedPatDone = true; // We're going to do it just below
|
||||
i = j+2; // Advance i past ":]"
|
||||
|
||||
// Use a rebuilt pattern. If we are top level,
|
||||
// then there is already a SET_OPEN in newPat, and
|
||||
// SET_CLOSE will be appended elsewhere.
|
||||
if (mode != 3) {
|
||||
newPat.append('[');
|
||||
}
|
||||
newPat.append(':').append(scratch).append(':');
|
||||
if (mode != 3) {
|
||||
newPat.append(']');
|
||||
}
|
||||
rebuildPattern = true;
|
||||
|
||||
if (mode == 3) {
|
||||
// Entire pattern is a category; leave parse
|
||||
// loop. This is one of 2 ways we leave this
|
||||
// loop if the pattern is well-formed.
|
||||
set(nestedSet);
|
||||
mode = 4;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Recurse to get the pairs for this nested set.
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case '-':
|
||||
case '&':
|
||||
newPat.append(lastOp);
|
||||
break;
|
||||
}
|
||||
nestedSet = new UnicodeSet();
|
||||
nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
|
||||
nestedPatDone = true;
|
||||
i = pos.getIndex();
|
||||
// Recurse to get the pairs for this nested set.
|
||||
// Backup i to '['.
|
||||
pos.setIndex(--i);
|
||||
switch (lastOp) {
|
||||
case '-':
|
||||
case '&':
|
||||
newPat.append(lastOp);
|
||||
break;
|
||||
}
|
||||
nestedSet = new UnicodeSet();
|
||||
nestedSet._applyPattern(pattern, pos, symbols, newPat, ignoreWhitespace);
|
||||
nestedPatDone = true;
|
||||
i = pos.getIndex();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1487,7 +1447,7 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
// loop if the pattern is well-formed.
|
||||
if (anchor > 2 || anchor == 1) {
|
||||
throw new IllegalArgumentException("Syntax error near $" + pattern);
|
||||
|
||||
|
||||
}
|
||||
if (anchor == 2) {
|
||||
rebuildPattern = true;
|
||||
|
@ -1524,16 +1484,24 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
|
||||
if (lastChar != NONE) {
|
||||
if (mode < 4) {
|
||||
throw new IllegalArgumentException("Missing ']'");
|
||||
}
|
||||
|
||||
// Treat a trailing '$' as indicating ETHER. This code is only
|
||||
// executed if symbols == NULL; otherwise other code parses the
|
||||
// anchor.
|
||||
if (lastChar == SymbolTable.SYMBOL_REF) {
|
||||
rebuildPattern = true;
|
||||
newPat.append(lastChar);
|
||||
add(TransliterationRule.ETHER);
|
||||
}
|
||||
|
||||
else if (lastChar != NONE) {
|
||||
add(lastChar, lastChar);
|
||||
_appendToPat(newPat, lastChar, false);
|
||||
}
|
||||
|
||||
// if (mode == 0) {
|
||||
// throw new IllegalArgumentException("Missing '[' in \"" +
|
||||
// pattern.substring(start) + '"');
|
||||
// }
|
||||
|
||||
// Handle unprocessed stuff preceding the closing ']'
|
||||
if (lastOp == '-') {
|
||||
// Trailing '-' is treated as literal
|
||||
|
@ -1543,7 +1511,9 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
throw new IllegalArgumentException("Unquoted trailing " + lastOp);
|
||||
}
|
||||
|
||||
newPat.append(']');
|
||||
if (mode == 4) {
|
||||
newPat.append(']');
|
||||
}
|
||||
|
||||
/**
|
||||
* If we saw a '^' after the initial '[' of this pattern, then perform
|
||||
|
@ -1553,21 +1523,6 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
complement();
|
||||
}
|
||||
|
||||
if (mode != 4) {
|
||||
throw new IllegalArgumentException("Missing ']'");
|
||||
}
|
||||
|
||||
// /**
|
||||
// * i indexes the last character we parsed or is pattern.length(). In
|
||||
// * the latter case, we have run off the end without finding a closing
|
||||
// * ']'. Otherwise, we know i < pattern.length(), and we set the
|
||||
// * ParsePosition to the next character to be parsed.
|
||||
// */
|
||||
// if (i == limit) {
|
||||
// throw new IllegalArgumentException("Missing ']' in \"" +
|
||||
// pattern.substring(start) + '"');
|
||||
// }
|
||||
|
||||
pos.setIndex(i);
|
||||
|
||||
// Use the rebuilt pattern (newPat) only if necessary. Prefer the
|
||||
|
@ -1586,136 +1541,6 @@ public class UnicodeSet extends UnicodeFilter {
|
|||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Generation of Unicode categories
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Sets this object to the given category, given its name.
|
||||
* The category name must be either a two-letter name, such as
|
||||
* "Lu", or a one letter name, such as "L". One-letter names
|
||||
* indicate the logical union of all two-letter names that start
|
||||
* with that letter. Case is significant. If the name starts
|
||||
* with the character '^' then the complement of the given
|
||||
* character set is returned.
|
||||
*
|
||||
* Although individual categories such as "Lu" are cached, we do
|
||||
* not currently cache single-letter categories such as "L" or
|
||||
* complements such as "^Lu" or "^L". It would be easy to cache
|
||||
* these as well in a hashtable should the need arise.
|
||||
*
|
||||
* NEW: The category name can now be a script name, as defined
|
||||
* by UScript.
|
||||
*/
|
||||
private void applyCategory(String catName) {
|
||||
boolean invert = (catName.length() > 1 &&
|
||||
catName.charAt(0) == '^');
|
||||
if (invert) {
|
||||
catName = catName.substring(1);
|
||||
}
|
||||
|
||||
boolean match = false;
|
||||
|
||||
// BE CAREFUL not to modify the return value from
|
||||
// getCategorySet(int).
|
||||
|
||||
// if we have two characters, search the category map for that
|
||||
// code and either construct and return a UnicodeSet from the
|
||||
// data in the category map or throw an exception
|
||||
if (catName.length() == 2) {
|
||||
int i = CATEGORY_NAMES.indexOf(catName);
|
||||
if (i>=0 && i%2==0) {
|
||||
i /= 2;
|
||||
if (i != UNSUPPORTED_CATEGORY) {
|
||||
set(getCategorySet(i));
|
||||
match = true;
|
||||
}
|
||||
}
|
||||
} else if (catName.length() == 1) {
|
||||
// if we have one character, search the category map for
|
||||
// codes beginning with that letter, and union together
|
||||
// all of the matching sets that we find (or throw an
|
||||
// exception if there are no matches)
|
||||
clear();
|
||||
for (int i=0; i<CATEGORY_COUNT; ++i) {
|
||||
if (i != UNSUPPORTED_CATEGORY &&
|
||||
CATEGORY_NAMES.charAt(2*i) == catName.charAt(0)) {
|
||||
addAll(getCategorySet(i));
|
||||
match = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!match) {
|
||||
// TODO: Add caching of these, if desired
|
||||
int script = UScript.getCode(catName);
|
||||
if (script != UScript.INVALID_CODE) {
|
||||
match = true;
|
||||
clear();
|
||||
int start = -1;
|
||||
int end = -2;
|
||||
for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
|
||||
if (UScript.getScript(i) == script) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
add(start, end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
add(start, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!match) {
|
||||
throw new IllegalArgumentException("Illegal category [:" + catName + ":]");
|
||||
}
|
||||
|
||||
if (invert) {
|
||||
complement();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an inversion list for the given category. This list is
|
||||
* cached and returned again if this method is called again with
|
||||
* the same parameter.
|
||||
*
|
||||
* Callers MUST NOT MODIFY the returned set.
|
||||
*/
|
||||
private static UnicodeSet getCategorySet(int cat) {
|
||||
if (CATEGORY_CACHE[cat] == null) {
|
||||
// Walk through all Unicode characters, noting the start
|
||||
// and end of each range for which Character.getType(c)
|
||||
// returns the given category integer.
|
||||
UnicodeSet set = new UnicodeSet();
|
||||
int start = -1;
|
||||
int end = -2;
|
||||
for (int i=MIN_VALUE; i<=MAX_VALUE; ++i) {
|
||||
if (Character.getType((char)i) == cat) {
|
||||
if ((end+1) == i) {
|
||||
end = i;
|
||||
} else {
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
start = end = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (start >= 0) {
|
||||
set.add(start, end);
|
||||
}
|
||||
CATEGORY_CACHE[cat] = set;
|
||||
}
|
||||
return CATEGORY_CACHE[cat];
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: Utility methods
|
||||
//----------------------------------------------------------------
|
||||
|
|
Loading…
Add table
Reference in a new issue