diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in
index 4a2923c1fdc..b8dbb49fadb 100644
--- a/icu4c/source/i18n/Makefile.in
+++ b/icu4c/source/i18n/Makefile.in
@@ -71,7 +71,7 @@ cpdtrans.o hextouni.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \
dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o nultrans.o \
remtrans.o titletrn.o tolowtrn.o toupptrn.o xformtrn.o \
name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o \
-llong.o nfrs.o nfrule.o nfsubs.o rbnf.o
+llong.o nfrs.o nfrule.o nfsubs.o rbnf.o upropset.o
diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp
index bf394056b36..2e6055df047 100644
--- a/icu4c/source/i18n/i18n.dsp
+++ b/icu4c/source/i18n/i18n.dsp
@@ -370,6 +370,10 @@ SOURCE=.\unum.cpp
# End Source File
# Begin Source File
+SOURCE=.\upropset.cpp
+# End Source File
+# Begin Source File
+
SOURCE=.\usearch.cpp
# End Source File
# Begin Source File
diff --git a/icu4c/source/i18n/quant.cpp b/icu4c/source/i18n/quant.cpp
index 25074f87514..f90951127a9 100644
--- a/icu4c/source/i18n/quant.cpp
+++ b/icu4c/source/i18n/quant.cpp
@@ -46,9 +46,15 @@ UMatchDegree Quantifier::matches(const Replaceable& text,
int32_t start = offset;
uint32_t count = 0;
while (count < maxCount) {
+ int32_t pos = offset;
UMatchDegree m = matcher->matches(text, offset, limit, incremental);
if (m == U_MATCH) {
++count;
+ if (pos == offset) {
+ // If offset has not moved we have a zero-width match.
+ // Don't keep matching it infinitely.
+ break;
+ }
} else if (incremental && m == U_PARTIAL_MATCH) {
return U_PARTIAL_MATCH;
} else {
diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp
index fc575304d66..25b0fe9d4fa 100644
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@@ -41,8 +41,6 @@
#define SEGMENT_CLOSE ((UChar)0x0029) /*)*/
#define CONTEXT_ANTE ((UChar)0x007B) /*{*/
#define CONTEXT_POST ((UChar)0x007D) /*}*/
-#define SET_OPEN ((UChar)0x005B) /*[*/
-#define SET_CLOSE ((UChar)0x005D) /*]*/
#define CURSOR_POS ((UChar)0x007C) /*|*/
#define CURSOR_OFFSET ((UChar)0x0040) /*@*/
#define ANCHOR_START ((UChar)0x005E) /*^*/
@@ -50,6 +48,13 @@
#define ONE_OR_MORE ((UChar)0x002B) /*+*/
#define ZERO_OR_ONE ((UChar)0x003F) /*?*/
+#define DOT ((UChar)46) /*.*/
+
+static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]";
+ 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90,
+ 108, 58, 93, 92, 114, 92, 110, 36, 93, 0
+};
+
// By definition, the ANCHOR_END special character is a
// trailing SymbolTable.SYMBOL_REF character.
// private static final char ANCHOR_END = '$';
@@ -514,6 +519,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
// Text after a presumed end anchor is a syntax err
return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start);
}
+ if (UnicodeSet::resemblesPattern(rule, pos-1)) {
+ pp.setIndex(pos-1); // Backup to opening '['
+ buf.append(parser.parseSet(rule, pp));
+ if (U_FAILURE(parser.status)) {
+ return syntaxError(U_MALFORMED_SET, rule, start);
+ }
+ pos = pp.getIndex();
+ continue;
+ }
// Handle escapes
if (c == ESCAPE) {
if (pos == limit) {
@@ -653,14 +667,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
}
post = buf.length();
break;
- case SET_OPEN:
- pp.setIndex(pos-1); // Backup to opening '['
- buf.append(parser.parseSet(rule, pp));
- if (U_FAILURE(parser.status)) {
- return syntaxError(U_MALFORMED_SET, rule, start);
- }
- pos = pp.getIndex();
- break;
case CURSOR_POS:
if (cursor >= 0) {
return syntaxError(U_MULTIPLE_CURSORS, rule, start);
@@ -689,6 +695,9 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
}
}
break;
+ case DOT:
+ buf.append(parser.getDotStandIn());
+ break;
case KLEENE_STAR:
case ONE_OR_MORE:
case ZERO_OR_ONE:
@@ -749,7 +758,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) {
buf.append(parser.generateStandInFor(m));
}
break;
- // case SET_CLOSE:
default:
// Disallow unquoted characters other than [0-9A-Za-z]
// in the printable ASCII range. These characters are
@@ -892,6 +900,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rules,
}
parseData->data = data;
determineVariableRange(rules);
+ dotStandIn = (UChar) -1;
UnicodeString str; // scratch
idBlock.truncate(0);
@@ -1257,6 +1266,17 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) {
return variableNext++;
}
+/**
+ * Fetch the stand-in character that represents the dot set ('.').
+ * The stand-in is created lazily: the value (UChar)-1 marks "not yet
+ * allocated", in which case a new UnicodeSet built from DOT_SET is
+ * handed to generateStandInFor() and the result is remembered.
+ */
+UChar TransliteratorParser::getDotStandIn() {
+    if (dotStandIn != (UChar) -1) {
+        // Already allocated by an earlier call; reuse it.
+        return dotStandIn;
+    }
+    dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET, status));
+    return dotStandIn;
+}
+
/**
* Append the value of the given variable name to the given
* UnicodeString.
diff --git a/icu4c/source/i18n/rbt_pars.h b/icu4c/source/i18n/rbt_pars.h
index e650435d30c..ffe81b5dd7e 100644
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@@ -108,6 +108,13 @@ class TransliteratorParser {
*/
UnicodeString undefinedVariableName;
+ /**
+ * The stand-in character for the 'dot' set, represented by '.' in
+ * patterns. This is allocated the first time it is needed, and
+ * reused thereafter.
+ */
+ UChar dotStandIn;
+
public:
/**
@@ -190,6 +197,12 @@ private:
*/
UChar generateStandInFor(UnicodeMatcher* adopted);
+ /**
+ * Return the stand-in for the dot set. It is allocated the first
+ * time and reused thereafter.
+ */
+ UChar getDotStandIn();
+
/**
* Append the value of the given variable name to the given
* UnicodeString.
diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp
index fc4d3f90a51..850ffae200e 100644
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@@ -17,6 +17,7 @@
#include "rbt_rule.h"
#include "umutex.h"
#include "ucln_in.h"
+#include "upropset.h"
// HIGH_VALUE > all valid values. 110000 for codepoints
#define UNICODESET_HIGH 0x0110000
@@ -42,49 +43,40 @@
#define UPPER_U ((UChar)0x0055) /*U*/
#define LOWER_U ((UChar)0x0075) /*u*/
-// N.B.: This mapping is different in ICU and Java
-//const UnicodeString UnicodeSet::CATEGORY_NAMES(
-// "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", "");
-static const UChar CATEGORY_NAMES[] = {
- 0x43, 0x6E, /* "Cn" */
- 0x4C, 0x75, /* "Lu" */
- 0x4C, 0x6C, /* "Ll" */
- 0x4C, 0x74, /* "Lt" */
- 0x4C, 0x6D, /* "Lm" */
- 0x4C, 0x6F, /* "Lo" */
- 0x4D, 0x6E, /* "Mn" */
- 0x4D, 0x65, /* "Me" */
- 0x4D, 0x63, /* "Mc" */
- 0x4E, 0x64, /* "Nd" */
- 0x4E, 0x6C, /* "Nl" */
- 0x4E, 0x6F, /* "No" */
- 0x5A, 0x73, /* "Zs" */
- 0x5A, 0x6C, /* "Zl" */
- 0x5A, 0x70, /* "Zp" */
- 0x43, 0x63, /* "Cc" */
- 0x43, 0x66, /* "Cf" */
- 0x43, 0x6F, /* "Co" */
- 0x43, 0x73, /* "Cs" */
- 0x50, 0x64, /* "Pd" */
- 0x50, 0x73, /* "Ps" */
- 0x50, 0x65, /* "Pe" */
- 0x50, 0x63, /* "Pc" */
- 0x50, 0x6F, /* "Po" */
- 0x53, 0x6D, /* "Sm" */
- 0x53, 0x63, /* "Sc" */
- 0x53, 0x6B, /* "Sk" */
- 0x53, 0x6F, /* "So" */
- 0x50, 0x69, /* "Pi" */
- 0x50, 0x66, /* "Pf" */
- 0x00
-};
-
-/**
- * A cache mapping character category integers, as returned by
- * Unicode::getType(), to pairs strings. Entries are initially
- * zero length and are filled in on demand.
- */
-static UnicodeSet* CATEGORY_CACHE = NULL;
+//// TEMPORARY: Remove when deprecated category code constructor is removed.
+//static const UChar CATEGORY_NAMES[] = {
+// 0x43, 0x6E, /* "Cn" */
+// 0x4C, 0x75, /* "Lu" */
+// 0x4C, 0x6C, /* "Ll" */
+// 0x4C, 0x74, /* "Lt" */
+// 0x4C, 0x6D, /* "Lm" */
+// 0x4C, 0x6F, /* "Lo" */
+// 0x4D, 0x6E, /* "Mn" */
+// 0x4D, 0x65, /* "Me" */
+// 0x4D, 0x63, /* "Mc" */
+// 0x4E, 0x64, /* "Nd" */
+// 0x4E, 0x6C, /* "Nl" */
+// 0x4E, 0x6F, /* "No" */
+// 0x5A, 0x73, /* "Zs" */
+// 0x5A, 0x6C, /* "Zl" */
+// 0x5A, 0x70, /* "Zp" */
+// 0x43, 0x63, /* "Cc" */
+// 0x43, 0x66, /* "Cf" */
+// 0x43, 0x6F, /* "Co" */
+// 0x43, 0x73, /* "Cs" */
+// 0x50, 0x64, /* "Pd" */
+// 0x50, 0x73, /* "Ps" */
+// 0x50, 0x65, /* "Pe" */
+// 0x50, 0x63, /* "Pc" */
+// 0x50, 0x6F, /* "Po" */
+// 0x53, 0x6D, /* "Sm" */
+// 0x53, 0x63, /* "Sc" */
+// 0x53, 0x6B, /* "Sk" */
+// 0x53, 0x6F, /* "So" */
+// 0x50, 0x69, /* "Pi" */
+// 0x50, 0x66, /* "Pf" */
+// 0x00
+//};
/**
* Delimiter string used in patterns to close a category reference:
@@ -92,16 +84,12 @@ static UnicodeSet* CATEGORY_CACHE = NULL;
*/
static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
-
/**
* Cleanup function for transliterator component; delegates to
* Transliterator::cleanupRegistry().
*/
U_CFUNC UBool unicodeset_cleanup(void) {
- if (CATEGORY_CACHE) {
- delete []CATEGORY_CACHE;
- CATEGORY_CACHE = NULL;
- }
+ UnicodePropertySet::cleanup();
return TRUE;
}
@@ -174,24 +162,24 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
applyPattern(pattern, pos, &symbols, status);
}
-/**
- * Constructs a set from the given Unicode character category.
- * @param category an integer indicating the character category as
- * returned by Unicode::getType()
.
- */
-UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) :
- len(0), capacity(START_EXTRA), bufferCapacity(0), list(0),
- buffer(0)
-{
- if (U_SUCCESS(status)) {
- if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
- status = U_ILLEGAL_ARGUMENT_ERROR;
- } else {
- list = new UChar32[capacity];
- *this = getCategorySet(category);
- }
- }
-}
+///**
+// * Constructs a set from the given Unicode character category.
+// * @param category an integer indicating the character category as
+// * returned by Unicode::getType()
.
+// */
+//UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) :
+// len(0), capacity(START_EXTRA), bufferCapacity(0), list(0),
+// buffer(0)
+//{
+// if (U_SUCCESS(status)) {
+// if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) {
+// status = U_ILLEGAL_ARGUMENT_ERROR;
+// } else {
+// list = new UChar32[capacity];
+// *this = getCategorySet(category);
+// }
+// }
+//}
/**
* Constructs a set that is identical to the given UnicodeSet.
@@ -319,6 +307,16 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern,
}
}
+/**
+ * Report whether the text at the given position of the given pattern
+ * looks like the beginning of a UnicodeSet pattern: either an opening
+ * '[' that is not the last character, or anything that
+ * UnicodePropertySet::resemblesPattern() accepts.
+ */
+UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
+    // An opening bracket followed by at least one more character.
+    if ((pos+1) < pattern.length() &&
+        pattern.charAt(pos) == (UChar)91/*[*/) {
+        return TRUE;
+    }
+    // Otherwise defer to the property-set syntax check.
+    return UnicodePropertySet::resemblesPattern(pattern, pos);
+}
+
/**
* Append the toPattern()
representation of a
* character to the given StringBuffer
.
@@ -339,6 +337,8 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape)
case COMPLEMENT:
case INTERSECTION:
case BACKSLASH:
+ case 123/*{*/:
+ case 125/*}*/:
buf.append(BACKSLASH);
break;
default:
@@ -451,15 +451,15 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
UBool escapeUnprintable) const {
result.append(SET_OPEN);
- // Check against the predefined categories. We implicitly build
- // up ALL category sets the first time toPattern() is called.
- for (int8_t cat=0; cat 1 &&
- catName.charAt(0) == COMPLEMENT);
- if (invert) {
- cat.remove(0, 1);
- }
-
- UBool match = FALSE;
-
- // if we have two characters, search the category map for that
- // code and either construct and return a UnicodeSet from the
- // data in the category map or throw an exception
- if (cat.length() == 2) {
- int32_t i = 0;
- int32_t numCategories = Unicode::GENERAL_TYPES_COUNT * 2;
-
- while (i < numCategories)
- {
- if (CATEGORY_NAMES[i] == cat.charAt(0)
- && CATEGORY_NAMES[i+1] == cat.charAt(1))
- {
- *this = getCategorySet((int8_t)(i/2));
- match = TRUE;
- break;
- }
- i += 2;
- }
- } else if (cat.length() == 1) {
- // if we have one character, search the category map for
- // codes beginning with that letter, and union together
- // all of the matching sets that we find (or throw an
- // exception if there are no matches)
- clear();
- for (int32_t i=0; i= 0) {
- add((UChar32) start, (UChar32) end);
- }
- start = end = i;
- }
- }
- }
- if (start >= 0) {
- add((UChar32) start, (UChar32) end);
- }
- }
- }
-
- if (!match) {
- status = U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
-
- if (invert) {
- complement();
- }
-}
-
-/**
- * Returns a pairs string for the given category. This string is
- * cached and returned again if this method is called again with
- * the same parameter.
- */
-const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) {
- // In order to tell what cache entries are empty, we assume
- // every category specifies at least one character. Thus
- // sets in the cache that are empty are uninitialized.
- if (CATEGORY_CACHE == NULL) {
- umtx_lock(NULL);
- if (CATEGORY_CACHE == NULL) {
- CATEGORY_CACHE = new UnicodeSet[Unicode::GENERAL_TYPES_COUNT];
- ucln_i18n_registerCleanup();
- }
- umtx_unlock(NULL);
- }
- if (CATEGORY_CACHE[cat].isEmpty()) {
- // Walk through all Unicode characters, noting the start
- // and end of each range for which Character.getType(c)
- // returns the given category integer. Since we are
- // iterating in order, we can simply append the resulting
- // ranges to the pairs string.
- UnicodeSet& set = CATEGORY_CACHE[cat];
- int32_t start = -1;
- int32_t end = -2;
- // N.B.: There seems to be a bug that deadlocks if you
- // call getType() with a supplemental character right now.
- // TODO: Change 0xFFFF to MAX_VALUE later.
- for (int32_t i=MIN_VALUE; i<=0xFFFF/*TEMPORARY*/; ++i) {
- if (Unicode::getType((UChar)i) == cat) {
- if ((end+1) == i) {
- end = i;
- } else {
- if (start >= 0) {
- set.add((UChar32)start, (UChar32)end);
- }
- start = end = i;
- }
- }
- }
- if (start >= 0) {
- set.add((UChar32)start, (UChar32)end);
- }
- }
- return CATEGORY_CACHE[cat];
-}
-
//----------------------------------------------------------------
// Implementation: Utility methods
//----------------------------------------------------------------
diff --git a/icu4c/source/i18n/upropset.cpp b/icu4c/source/i18n/upropset.cpp
new file mode 100644
index 00000000000..78e6f3f1528
--- /dev/null
+++ b/icu4c/source/i18n/upropset.cpp
@@ -0,0 +1,597 @@
+/*
+**********************************************************************
+* Copyright (c) 2001, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.cpp,v $
+* $Date: 2001/10/17 19:20:41 $
+* $Revision: 1.1 $
+**********************************************************************
+*/
+#include "upropset.h"
+#include "ustrfmt.h"
+#include "unicode/unistr.h"
+#include "unicode/uscript.h"
+#include "unicode/uniset.h"
+#include "unicode/parsepos.h"
+#include "hash.h"
+
+U_NAMESPACE_BEGIN
+
+static Hashtable* NAME_MAP = NULL;
+
+static Hashtable* CATEGORY_MAP = NULL;
+
+/**
+ * A cache mapping character category integers, as returned by
+ * UCharacter.getType(), to sets. Entries are initially
+ * null and are created on demand.
+ */
+static UnicodeSet* CATEGORY_CACHE = NULL;
+
+/**
+ * A cache mapping script integers, as defined by
+ * UScript, to sets. Entries are initially
+ * null and are created on demand.
+ */
+static UnicodeSet* SCRIPT_CACHE = NULL;
+
+// Special value codes
+static const int32_t ANY = -1; // general category: all code points
+
+//----------------------------------------------------------------
+// Unicode string and character constants
+//----------------------------------------------------------------
+
+static const UChar POSIX_OPEN[] = { 91,58,0 }; // "[:"
+static const UChar POSIX_CLOSE[] = { 58,93,0 }; // ":]"
+
+static const UChar PERL_OPEN[] = { 92,112,0 }; // "\\p"
+static const UChar PERL_CLOSE[] = { 125,0 }; // "}"
+
+static const UChar HAT = 0x005E; /*^*/
+static const UChar UPPER_P = 0x0050; /*P*/
+static const UChar LEFT_BRACE = 0x007B; /*{*/
+static const UChar EQUALS = 0x003D; /*=*/
+
+//----------------------------------------------------------------------
+// class _CharString
+// An identical class named CharString can be found in transreg.cpp.
+// If we find ourselves needing another copy of this utility class we
+// should probably pull it out into putil or some such place.
+//----------------------------------------------------------------------
+
+// Small helper that extracts a UnicodeString into a char buffer.
+// Short strings use the inline buffer; longer ones use a heap
+// buffer that is released by the destructor.
+class _CharString {
+ public:
+    _CharString(const UnicodeString& str);
+    ~_CharString();
+    operator char*() { return ptr; }
+ private:
+    char buf[128];   // inline storage for short strings
+    char* ptr;       // points at buf or at a heap allocation
+};
+
+_CharString::_CharString(const UnicodeString& str) {
+    const int32_t len = str.length();
+    // Use the inline buffer only when the string is strictly shorter
+    // than it; otherwise allocate with a little slack.
+    ptr = (len < (int32_t)sizeof(buf)) ? buf : new char[len + 8];
+    str.extract(0, 0x7FFFFFFF, ptr, "");
+}
+
+_CharString::~_CharString() {
+    // Nothing to free when the inline buffer was used.
+    if (ptr == buf) {
+        return;
+    }
+    delete[] ptr;
+}
+
+//----------------------------------------------------------------
+// Public API
+//----------------------------------------------------------------
+
+/**
+ * Report whether the text at the given position of the given pattern
+ * looks like the start of a property set pattern, i.e. [:foo:],
+ * \p{foo}, or \P{foo}.  Only the two-character opening delimiter and
+ * the minimum overall length are checked.
+ */
+UBool UnicodePropertySet::resemblesPattern(const UnicodeString& pattern,
+                                           int32_t pos) {
+    // The shortest possible pattern has 5 characters.
+    if (pattern.length() < (pos+5)) {
+        return FALSE;
+    }
+
+    // POSIX style opening "[:" (this also covers "[:^").
+    if (0 == pattern.compare(pos, 2, POSIX_OPEN)) {
+        return TRUE;
+    }
+
+    // Perl style opening "\p" or "\P" (compared ignoring case).
+    return 0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT);
+}
+
+/**
+ * Create a UnicodeSet by parsing the given pattern at the given
+ * parse position.
+ *
+ * @param pattern the pattern string
+ * @param ppos on entry, the position at which to begin parsing.
+ * This should be one of the locations marked '^':
+ *
+ * [:blah:] \p{blah} \P{blah}
+ * ^ % ^ % ^ %
+ *
+ * On return, the position after the last character parsed, that is,
+ * the locations marked '%'. If the parse fails, ppos is returned
+ * unchanged.
+ * @return a newly-constructed UnicodeSet object, or null upon
+ * failure.
+ */
+UnicodeSet* UnicodePropertySet::createFromPattern(const UnicodeString& pattern,
+ ParsePosition& ppos) {
+ // Build the name/value maps and the caches on first use.
+ init();
+
+ UnicodeSet* set = NULL;
+
+ int32_t pos = ppos.getIndex();
+
+ // On entry, ppos should point at the opening "[:", "\p" or "\P".
+
+ // Minimum length is 5 characters, e.g. \p{L}
+ if ((pos+5) > pattern.length()) {
+ return NULL;
+ }
+
+ UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat}
+ UBool invert = FALSE;
+
+ // Look for an opening [:, [:^, \p, or \P
+ if (0 == pattern.compare(pos, 2, POSIX_OPEN)) {
+ posix = TRUE;
+ pos = skipWhitespace(pattern, pos+2);
+ // A '^' right after the opening delimiter requests the inverse set.
+ if (pos < pattern.length() && pattern.charAt(pos) == HAT) {
+ ++pos;
+ invert = TRUE;
+ }
+ } else if (0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT)) {
+ // The opener matched ignoring case; an upper case 'P' requests the
+ // inverse set.
+ invert = (pattern.charAt(pos+1) == UPPER_P);
+ pos = skipWhitespace(pattern, pos+2);
+ if (pos == pattern.length() || pattern.charAt(pos++) != LEFT_BRACE) {
+ // Syntax error; "\p" or "\P" not followed by "{"
+ return NULL;
+ }
+ } else {
+ // Open delimiter not seen
+ return NULL;
+ }
+
+ // Look for the matching close delimiter, either :] or }
+ int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
+ if (close < 0) {
+ // Syntax error; close delimiter missing
+ return NULL;
+ }
+
+ // Look for an '=' sign. If this is present, we will parse a
+ // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
+ // pattern.
+ // The search is not bounded by the close delimiter, so an '=' found
+ // at or after 'close' is not part of this pattern and is ignored.
+ int32_t equals = pattern.indexOf(EQUALS, pos);
+ if (equals >= 0 && equals < close) {
+ // Equals seen; parse medium/long pattern
+ UnicodeString typeName = munge(pattern, pos, equals);
+ UnicodeString valueName = munge(pattern, equals+1, close);
+ // NAME_MAP maps a type name to the factory function registered
+ // for it by addType().
+ SetFactory factory;
+ factory = voidPtrToSetFactory(NAME_MAP->get(typeName));
+ if (factory == NULL) {
+ // Syntax error; type name not recognized
+ return NULL;
+ }
+ set = (*factory)(valueName);
+ } else {
+ // No equals seen; parse short format \p{Cf}
+ UnicodeString shortName = munge(pattern, pos, close);
+
+ // First try general category
+ set = createCategorySet(shortName);
+
+ // If this fails, try script
+ if (set == NULL) {
+ set = createScriptSet(shortName);
+ }
+ }
+
+ // Upon failure, return NULL with ppos unchanged
+ if (set == NULL) {
+ return NULL;
+ }
+
+ if (invert) {
+ set->complement();
+ }
+
+ // Move to the limit position after the close delimiter
+ // (":]" is two characters long, "}" is one).
+ ppos.setIndex(close + (posix ? 2 : 1));
+
+ return set;
+}
+
+//----------------------------------------------------------------
+// Property set factory static methods
+// NOTE: This will change/go away when we implement UCharacter
+// based property retrieval.
+//----------------------------------------------------------------
+
+/**
+ * Filter callback: accept code points whose digit value equals the
+ * int32_t that 'context' points to.
+ */
+static UBool _numericValueFilter(UChar32 c, void* context) {
+    int32_t value = * (int32_t*) context;
+    // TODO: Change this to a more generic function, like
+    // u_charNumericValue (when one exists).
+    return u_charDigitValue(c) == value;
+}
+
+/**
+ * Given a numeric value name, create the set of code points having
+ * that digit value.
+ * @param valueName a pre-munged numeric value, e.g. "3"
+ * @return a newly allocated set.  The set is empty when valueName is
+ * empty, is not entirely a number, or denotes a negative,
+ * non-integral, or out-of-range value.
+ */
+UnicodeSet* UnicodePropertySet::createNumericValueSet(const UnicodeString& valueName) {
+    _CharString cvalueName(valueName);
+    UnicodeSet* set = new UnicodeSet();
+    char* text = cvalueName;
+    char* end;
+    double value = uprv_strtod(text, &end);
+    // Reject input where nothing was parsed or where trailing
+    // characters remain.  Also check the range BEFORE converting to
+    // int32_t: converting an out-of-range double (or NaN, which fails
+    // both comparisons) to an integer is undefined.
+    if (end == text || *end != 0 ||
+        !(value >= 0 && value <= (double)0x7FFFFFFF)) {
+        return set;
+    }
+    int32_t ivalue = (int32_t) value;
+    if (ivalue != value) {
+        // UCharacter doesn't support non-integral values, so just
+        // return an empty set
+        return set;
+    }
+    initSetFromFilter(*set, _numericValueFilter, &ivalue);
+    return set;
+}
+
+/**
+ * Given a general category value name, create a corresponding
+ * set and return it, or return null if the name is invalid.
+ * @param valueName a pre-munged general category value name
+ * @return a newly allocated set, or NULL if valueName has no entry
+ * in CATEGORY_MAP
+ */
+UnicodeSet* UnicodePropertySet::createCategorySet(const UnicodeString& valueName) {
+    // A code of 0 is treated as "name not found".
+    int32_t valueCode = CATEGORY_MAP->geti(valueName);
+    if (valueCode == 0) {
+        return NULL;
+    }
+
+    UnicodeSet* set = new UnicodeSet();
+    if (valueCode == ANY) {
+        // Special case: all code points (complement of the empty set).
+        set->complement();
+        return set;
+    }
+    // Otherwise valueCode is a bit mask with bit (1 << cat) set for
+    // every general category 'cat' covered by the name; union the
+    // cached set of each selected category.
+    for (int32_t cat=0; cat<U_CHAR_CATEGORY_COUNT; ++cat) {
+        if ((valueCode & (1 << cat)) != 0) {
+            set->addAll(getCategorySet(cat));
+        }
+    }
+    return set;
+}
+
+/**
+ * Build the set for a script, identified by name.
+ * @param valueName a pre-munged script value name
+ * @return a newly allocated copy of the cached script set, or NULL
+ * when uscript_getCode() does not recognize the name
+ */
+UnicodeSet* UnicodePropertySet::createScriptSet(const UnicodeString& valueName) {
+    _CharString nameChars(valueName);
+    UErrorCode errorCode = U_ZERO_ERROR;
+    UScriptCode code = uscript_getCode(nameChars, &errorCode);
+    if (code != USCRIPT_INVALID_CODE && !U_FAILURE(errorCode)) {
+        return new UnicodeSet(getScriptSet(code));
+    }
+    // Syntax error; unknown short name
+    return NULL;
+}
+
+//----------------------------------------------------------------
+// Utility methods
+//----------------------------------------------------------------
+
+/**
+ * Filter callback: accept code points whose u_charType() equals the
+ * int32_t that 'context' points to.
+ */
+static UBool _categoryFilter(UChar32 c, void* context) {
+    const int32_t wanted = * (int32_t*) context;
+    return u_charType(c) == wanted;
+}
+
+/**
+ * Look up the UnicodeSet for a general category in CATEGORY_CACHE.
+ * An empty cache entry is taken to mean "not yet computed" and is
+ * filled in before being returned, so repeated calls with the same
+ * argument return the same cached set.
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+const UnicodeSet& UnicodePropertySet::getCategorySet(int32_t cat) {
+    UnicodeSet& entry = CATEGORY_CACHE[cat];
+    if (entry.isEmpty()) {
+        initSetFromFilter(entry, _categoryFilter, &cat);
+    }
+    return entry;
+}
+
+/**
+ * Filter callback: accept code points whose script, as reported by
+ * uscript_getScript(), equals the UScriptCode that 'context' points
+ * to.  The error code from uscript_getScript() is not examined.
+ */
+static UBool _scriptFilter(UChar32 c, void* context) {
+    UScriptCode value = * (UScriptCode*) context;
+    UErrorCode ec = U_ZERO_ERROR;
+    return uscript_getScript(c, &ec) == value;
+}
+
+/**
+ * Returns a UnicodeSet for the given script.  This set is
+ * cached and returned again if this method is called again with
+ * the same parameter.  An empty cache entry is taken to mean
+ * "not yet computed".
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+const UnicodeSet& UnicodePropertySet::getScriptSet(UScriptCode script) {
+    // Fill the SCRIPT cache entry.  Writing into CATEGORY_CACHE here
+    // would leave the script entry permanently empty, corrupt a
+    // category entry, and index past the end of the (smaller)
+    // category array for larger script codes.
+    UnicodeSet& entry = SCRIPT_CACHE[script];
+    if (entry.isEmpty()) {
+        initSetFromFilter(entry, _scriptFilter, &script);
+    }
+    return entry;
+}
+
+/**
+ * Given a string, munge it to lose the whitespace. So "General
+ * Category " becomes "GeneralCategory". We munge all type and value
+ * strings, and store all type and value keys pre-munged. NOTE:
+ * Unlike the Java version, we do not modify the case, since we use a
+ * case-insensitive compare function.
+ */
+UnicodeString UnicodePropertySet::munge(const UnicodeString& str,
+ int32_t start, int32_t limit) {
+ UnicodeString buf;
+ for (int32_t i=start; i= 0) {
+ set.add((UChar32)start, (UChar32)end);
+ }
+ start = end = i;
+ }
+ }
+ }
+ if (start >= 0) {
+ set.add((UChar32)start, (UChar32)end);
+ }
+}
+
+//----------------------------------------------------------------
+// Type and value name maps
+//----------------------------------------------------------------
+
+/**
+ * Register a property type in NAME_MAP under both its short and its
+ * long name; both keys map to the same factory function.  Errors
+ * reported by the hashtable are not examined.
+ */
+void UnicodePropertySet::addType(const UnicodeString& shortName,
+                                 const UnicodeString& longName,
+                                 SetFactory factory) {
+    UErrorCode status = U_ZERO_ERROR;
+    // The table stores void*, so convert the function pointer once.
+    void* factoryPtr = setFactoryToVoidPtr(factory);
+    NAME_MAP->put(shortName, factoryPtr, status);
+    NAME_MAP->put(longName, factoryPtr, status);
+}
+
+/**
+ * Register an integer value in the given map under its short name
+ * and, when one is supplied, under its long name as well.  Errors
+ * reported by the hashtable are not examined.
+ */
+void UnicodePropertySet::addValue(Hashtable* map,
+                                  const UnicodeString& shortName,
+                                  const UnicodeString& longName,
+                                  int32_t value) {
+    // assert(value != 0);
+    UErrorCode status = U_ZERO_ERROR;
+    map->puti(shortName, value, status);
+    if (longName.length() == 0) {
+        // No long name for this value.
+        return;
+    }
+    map->puti(longName, value, status);
+}
+
+void UnicodePropertySet::init() {
+ if (NAME_MAP != NULL) {
+ return;
+ }
+
+ NAME_MAP = new Hashtable(TRUE);
+ CATEGORY_MAP = new Hashtable(TRUE);
+ CATEGORY_CACHE = new UnicodeSet[U_CHAR_CATEGORY_COUNT];
+ SCRIPT_CACHE = new UnicodeSet[USCRIPT_CODE_LIMIT];
+
+ // NOTE: We munge all search keys to have no whitespace; case is
+ // not modified because lookups are case-insensitive. As such,
+ // all stored keys should contain no whitespace.
+
+ // Load the map with type data
+
+ addType("GC", "GENERALCATEGORY", createCategorySet);
+
+ //addType("CC", "COMBININGCLASS", COMBINING_CLASS);
+ //addType("BC", "BIDICLASS", BIDI_CLASS);
+ //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE);
+
+ addType("NV", "NUMERICVALUE", createNumericValueSet);
+
+ //addType("NT", "NUMERICTYPE", NUMERIC_TYPE);
+ //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH);
+ //addType("LB", "LINEBREAK", LINE_BREAK);
+ //addType("JT", "JOININGTYPE", JOINING_TYPE);
+
+ addType("SC", "SCRIPT", createScriptSet);
+
+ // Load the map with value data
+
+ // General Category
+
+ addValue(CATEGORY_MAP, "ANY", "", ANY); // special case
+
+ addValue(CATEGORY_MAP, "C", "OTHER",
+ (1 << U_CONTROL_CHAR) |
+ (1 << U_FORMAT_CHAR) |
+ (1 << U_GENERAL_OTHER_TYPES) |
+ (1 << U_PRIVATE_USE_CHAR) |
+ (1 << U_SURROGATE));
+
+ addValue(CATEGORY_MAP, "CC", "CONTROL",
+ 1 << U_CONTROL_CHAR);
+ addValue(CATEGORY_MAP, "CF", "FORMAT",
+ 1 << U_FORMAT_CHAR);
+ addValue(CATEGORY_MAP, "CN", "UNASSIGNED",
+ 1 << U_GENERAL_OTHER_TYPES);
+ addValue(CATEGORY_MAP, "CO", "PRIVATEUSE",
+ 1 << U_PRIVATE_USE_CHAR);
+ addValue(CATEGORY_MAP, "CS", "SURROGATE",
+ 1 << U_SURROGATE);
+
+ addValue(CATEGORY_MAP, "L", "LETTER",
+ (1 << U_LOWERCASE_LETTER) |
+ (1 << U_MODIFIER_LETTER) |
+ (1 << U_OTHER_LETTER) |
+ (1 << U_TITLECASE_LETTER) |
+ (1 << U_UPPERCASE_LETTER));
+
+ addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER",
+ 1 << U_LOWERCASE_LETTER);
+ addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER",
+ 1 << U_MODIFIER_LETTER);
+ addValue(CATEGORY_MAP, "LO", "OTHERLETTER",
+ 1 << U_OTHER_LETTER);
+ addValue(CATEGORY_MAP, "LT", "TITLECASELETTER",
+ 1 << U_TITLECASE_LETTER);
+ addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER",
+ 1 << U_UPPERCASE_LETTER);
+
+ addValue(CATEGORY_MAP, "M", "MARK",
+ (1 << U_NON_SPACING_MARK) |
+ (1 << U_COMBINING_SPACING_MARK) |
+ (1 << U_ENCLOSING_MARK));
+
+ addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK",
+ 1 << U_NON_SPACING_MARK);
+ addValue(CATEGORY_MAP, "MC", "SPACINGMARK",
+ 1 << U_COMBINING_SPACING_MARK);
+ addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK",
+ 1 << U_ENCLOSING_MARK);
+
+ addValue(CATEGORY_MAP, "N", "NUMBER",
+ (1 << U_DECIMAL_DIGIT_NUMBER) |
+ (1 << U_LETTER_NUMBER) |
+ (1 << U_OTHER_NUMBER));
+
+ addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER",
+ 1 << U_DECIMAL_DIGIT_NUMBER);
+ addValue(CATEGORY_MAP, "NL", "LETTERNUMBER",
+ 1 << U_LETTER_NUMBER);
+ addValue(CATEGORY_MAP, "NO", "OTHERNUMBER",
+ 1 << U_OTHER_NUMBER);
+
+ addValue(CATEGORY_MAP, "P", "PUNCTUATION",
+ (1 << U_CONNECTOR_PUNCTUATION) |
+ (1 << U_DASH_PUNCTUATION) |
+ (1 << U_END_PUNCTUATION) |
+ (1 << U_FINAL_PUNCTUATION) |
+ (1 << U_INITIAL_PUNCTUATION) |
+ (1 << U_OTHER_PUNCTUATION) |
+ (1 << U_START_PUNCTUATION));
+
+ addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION",
+ 1 << U_CONNECTOR_PUNCTUATION);
+ addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION",
+ 1 << U_DASH_PUNCTUATION);
+ addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION",
+ 1 << U_END_PUNCTUATION);
+ addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION",
+ 1 << U_FINAL_PUNCTUATION);
+ addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION",
+ 1 << U_INITIAL_PUNCTUATION);
+ addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION",
+ 1 << U_OTHER_PUNCTUATION);
+ addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION",
+ 1 << U_START_PUNCTUATION);
+
+ addValue(CATEGORY_MAP, "S", "SYMBOL",
+ (1 << U_CURRENCY_SYMBOL) |
+ (1 << U_MODIFIER_SYMBOL) |
+ (1 << U_MATH_SYMBOL) |
+ (1 << U_OTHER_SYMBOL));
+
+ addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL",
+ 1 << U_CURRENCY_SYMBOL);
+ addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL",
+ 1 << U_MODIFIER_SYMBOL);
+ addValue(CATEGORY_MAP, "SM", "MATHSYMBOL",
+ 1 << U_MATH_SYMBOL);
+ addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL",
+ 1 << U_OTHER_SYMBOL);
+
+ addValue(CATEGORY_MAP, "Z", "SEPARATOR",
+ (1 << U_LINE_SEPARATOR) |
+ (1 << U_PARAGRAPH_SEPARATOR) |
+ (1 << U_SPACE_SEPARATOR));
+
+ addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR",
+ 1 << U_LINE_SEPARATOR);
+ addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR",
+ 1 << U_PARAGRAPH_SEPARATOR);
+ addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR",
+ 1 << U_SPACE_SEPARATOR);
+}
+
+/**
+ * Release the maps and caches allocated by init() and reset the
+ * pointers to NULL.  Does nothing when NAME_MAP is NULL.
+ */
+void UnicodePropertySet::cleanup() {
+    if (NAME_MAP == NULL) {
+        return;
+    }
+
+    delete NAME_MAP;
+    NAME_MAP = NULL;
+
+    delete CATEGORY_MAP;
+    CATEGORY_MAP = NULL;
+
+    delete[] CATEGORY_CACHE;
+    CATEGORY_CACHE = NULL;
+
+    delete[] SCRIPT_CACHE;
+    SCRIPT_CACHE = NULL;
+}
+
+U_NAMESPACE_END
+
+//eof
diff --git a/icu4c/source/i18n/upropset.h b/icu4c/source/i18n/upropset.h
new file mode 100644
index 00000000000..f4bacb5907f
--- /dev/null
+++ b/icu4c/source/i18n/upropset.h
@@ -0,0 +1,240 @@
+/*
+**********************************************************************
+* Copyright (c) 2001, International Business Machines
+* Corporation and others. All Rights Reserved.
+**********************************************************************
+* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.h,v $
+* $Date: 2001/10/17 19:20:41 $
+* $Revision: 1.1 $
+**********************************************************************
+*/
+#ifndef _UPROPSET_H_
+#define _UPROPSET_H_
+
+#include "unicode/utypes.h"
+#include "unicode/uscript.h"
+
+U_NAMESPACE_BEGIN
+
+class UnicodeString;
+class UnicodeSet;
+class ParsePosition;
+class Hashtable;
+
+/**
+ * INTERNAL CLASS implementing the UnicodeSet properties as outlined
+ * at:
+ *
+ * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html
+ *
+ * Recognized syntax:
+ *
+ * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
+ * \p{foo} \P{foo} - white space not allowed within "\p" or "\P"
+ *
+ * Other than the above restrictions, white space is ignored. Case
+ * is ignored except in "\p" and "\P".
+ *
+ * This class cannot be instantiated. It has a public static method,
+ * createFromPattern(), which takes a pattern to be parsed and returns
+ * a new UnicodeSet. Another public static method,
+ * resemblesPattern(), returns true if a given pattern string appears
+ * to be a property set pattern, and therefore should be passed in to
+ * createFromPattern().
+ *
+ * NOTE: Current implementation is incomplete. The following list
+ * indicates which properties are supported.
+ *
+ * + GeneralCategory
+ * CombiningClass
+ * BidiClass
+ * DecompositionType
+ * + NumericValue
+ * NumericType
+ * EastAsianWidth
+ * LineBreak
+ * JoiningType
+ * + Script
+ *
+ * '+' indicates a supported property.
+ *
+ * @author Alan Liu
+ * @version $RCSfile: upropset.h,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:20:41 $
+ */
+class UnicodePropertySet {
+
+ public:
+
+ //----------------------------------------------------------------
+ // Public API
+ //----------------------------------------------------------------
+
+ /**
+ * Return true if the given position, in the given pattern, appears
+ * to be the start of a property set pattern [:foo:], \p{foo}, or
+ * \P{foo}.
+ */
+ static UBool resemblesPattern(const UnicodeString& pattern, int32_t pos);
+
+ /**
+ * Create a UnicodeSet by parsing the given pattern at the given
+ * parse position.
+ *
+ * @param pattern the pattern string
+ * @param ppos on entry, the position at which to begin parsing.
+ * This should be one of the locations marked '^':
+ *
+ * [:blah:] \p{blah} \P{blah}
+ * ^ % ^ % ^ %
+ *
+ * On return, the position after the last character parsed, that is,
+ * the locations marked '%'. If the parse fails, ppos is returned
+ * unchanged.
+ * @return a newly-constructed UnicodeSet object, or null upon
+ * failure.
+ */
+ static UnicodeSet* createFromPattern(const UnicodeString& pattern,
+ ParsePosition& ppos);
+
+ private:
+
+ //----------------------------------------------------------------
+ // Property set factory static methods
+ // NOTE: This will change/go away when we implement UCharacter
+ // based property retrieval.
+ //----------------------------------------------------------------
+
+ typedef UnicodeSet* (*SetFactory)(const UnicodeString& valueName); // signature shared by the create*Set factories below
+
+ static UnicodeSet* createNumericValueSet(const UnicodeString& valueName); // undocumented here; presumably the NumericValue factory -- TODO confirm
+
+ /**
+ * Given a general category value name, create a corresponding
+ * set and return it, or return null if the name is invalid.
+ * @param valueName a pre-munged general category value name
+ */
+ static UnicodeSet* createCategorySet(const UnicodeString& valueName);
+
+ /**
+ * Given a script value name, create a corresponding set and
+ * return it, or return null if the name is invalid.
+ * @param valueName a pre-munged script value name
+ */
+ static UnicodeSet* createScriptSet(const UnicodeString& valueName);
+
+ //----------------------------------------------------------------
+ // Utility methods
+ //----------------------------------------------------------------
+
+ /**
+ * Returns a UnicodeSet for the given category. This set is
+ * cached and returned again if this method is called again with
+ * the same parameter.
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+ static const UnicodeSet& getCategorySet(int32_t cat);
+
+ /**
+ * Returns a UnicodeSet for the given script. This set is
+ * cached and returned again if this method is called again with
+ * the same parameter.
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+ static const UnicodeSet& getScriptSet(UScriptCode script);
+
+ /**
+ * Given a string, munge it to upper case and lose the whitespace.
+ * So "General Category " becomes "GENERALCATEGORY". We munge all
+ * type and value strings, and store all type and value keys
+ * pre-munged.
+ */
+ static UnicodeString munge(const UnicodeString& str, int32_t start, int32_t limit);
+
+ /**
+ * Skip over a sequence of zero or more white space characters
+ * at pos. Return the index of the first non-white-space character
+ * at or after pos, or str.length(), if there is none.
+ */
+ static int32_t skipWhitespace(const UnicodeString& str, int32_t pos);
+
+ //----------------------------------------------------------------
+ // Generic filter-based scanning code
+ //
+ // NOTE: In general, we don't want to do this! This is a temporary
+ // implementation until we have time for something that examines
+ // the underlying UCharacter data structures in an intelligent
+ // way. Iterating over all code points is dumb. What we want to
+ // do, for instance, is iterate over internally-stored ranges
+ // of characters that have a given property.
+ //----------------------------------------------------------------
+
+ /**
+ * A filter that returns TRUE if the given code point should be
+ * included in the UnicodeSet being constructed.
+ */
+ typedef UBool (*Filter)(UChar32 codePoint, void* context);
+
+ /**
+ * Set the given UnicodeSet to contain all code points for which
+ * filter returns TRUE. The context parameter is passed unchanged
+ * to the filter function.
+ */
+ static void initSetFromFilter(UnicodeSet& set, Filter filter,
+ void* context);
+
+ //----------------------------------------------------------------
+ // Type and value name maps
+ //----------------------------------------------------------------
+
+ /**
+ * Add a type mapping to the name map.
+ */
+ static void addType(const UnicodeString& shortName,
+ const UnicodeString& longName,
+ SetFactory factory);
+
+ /**
+ * Add a value mapping to the name map.
+ */
+ static void addValue(Hashtable* map,
+ const UnicodeString& shortName,
+ const UnicodeString& longName,
+ int32_t value);
+
+ static void init(); // undocumented here; presumably fills the type/value maps via addType/addValue -- TODO confirm
+
+ public:
+ static void cleanup(); // public, unlike init(): frees the static maps and caches and resets them to NULL
+
+ private:
+ //----------------------------------------------------------------
+ // SetFactory <=> void*
+ // C++ distinguishes pointers to functions from pointers to objects,
+ // so a SetFactory does not convert to or from void* implicitly.
+ // These helpers convert by storing one union member and reading the
+ // other. NOTE(review): assumes both pointer kinds share a representation.
+ //----------------------------------------------------------------
+
+ union SetFactoryTok {
+ void* voidPointer;
+ SetFactory functionPointer;
+ };
+
+ inline static void* setFactoryToVoidPtr(SetFactory f) {
+ SetFactoryTok tok;
+ tok.functionPointer = f;
+ return tok.voidPointer;
+ }
+
+ inline static SetFactory voidPtrToSetFactory(void* p) {
+ SetFactoryTok tok;
+ tok.voidPointer = p;
+ return tok.functionPointer;
+ }
+};
+
+U_NAMESPACE_END
+
+#endif
diff --git a/icu4c/source/test/intltest/transtst.cpp b/icu4c/source/test/intltest/transtst.cpp
index b2d9334816e..d1a863cbb16 100644
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@@ -124,10 +124,11 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE(42,TestUndefinedVariable);
TESTCASE(43,TestEmptyContext);
TESTCASE(44,TestCompoundFilterID);
- TESTCASE(45,TestDevanagariLatinRT);
- TESTCASE(46,TestTeluguLatinRT);
- TESTCASE(47,TestCompoundLatinRT);
- TESTCASE(48,TestSanskritLatinRT);
+ TESTCASE(45,TestPropertySet);
+ TESTCASE(46,TestDevanagariLatinRT);
+ TESTCASE(47,TestTeluguLatinRT);
+ TESTCASE(48,TestCompoundLatinRT);
+ TESTCASE(49,TestSanskritLatinRT);
default: name = ""; break;
}
}
@@ -2066,6 +2067,15 @@ void TransliteratorTest::TestCompoundFilterID(void) {
}
}
+/**
+ * Test the new property set syntax (\p{...}) inside transliterator rules.
+ */
+void TransliteratorTest::TestPropertySet() {
+ expect("a>A; \\p{Lu}>x; \\p{ANY}>y;", "abcDEF", "Ayyxxx"); // 'a' -> A, uppercase letters -> x, every other character -> y
+ expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9", // wraps each run matched by (.+) in brackets
+ "[ a stitch ]\n[ in time ]\r[ saves 9]"); // expected text keeps \n and \r outside the brackets, i.e. '.' must not match them
+}
+
//======================================================================
// Ram's tests
//======================================================================
diff --git a/icu4c/source/test/intltest/transtst.h b/icu4c/source/test/intltest/transtst.h
index 5f888d6cac8..e22a907cdbf 100644
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@@ -210,6 +210,11 @@ class TransliteratorTest : public IntlTest {
*/
void TestCompoundFilterID(void);
+ /**
+ * Test new property set syntax
+ */
+ void TestPropertySet(void);
+
/* Devanagari-Latin rules Test */
void TestDevanagariLatinRT(void);
diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp
index 8b450e67121..8970b9b7335 100644
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@@ -41,7 +41,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
CASE(4,TestMinimalRep);
CASE(5,TestAPI);
CASE(6,TestScriptSet);
- CASE(7,TestExhaustive);
+ CASE(7,TestPropertySet);
+ CASE(8,TestExhaustive);
default: name = ""; break;
}
}
@@ -103,16 +104,18 @@ UnicodeSetTest::TestCategories(void) {
void
UnicodeSetTest::TestCloneEqualHash(void) {
UErrorCode status = U_ZERO_ERROR;
- int8_t category=Unicode::LOWERCASE_LETTER;
- UnicodeSet *set1=new UnicodeSet(category, status); // :Li: Letter, lowercase
+ //int8_t category=Unicode::LOWERCASE_LETTER;
+ //UnicodeSet *set1=new UnicodeSet(category, status); // :Li: Letter, lowercase
+ UnicodeSet *set1=new UnicodeSet("[:Ll:]", status); // Letter, lowercase
if (U_FAILURE(status)){
- errln((UnicodeString)"FAIL: Can't construst set with cateegory->Ll");
+ errln((UnicodeString)"FAIL: Can't construst set with category->Ll");
return;
}
- category=Unicode::DECIMAL_DIGIT_NUMBER;
- UnicodeSet *set2=new UnicodeSet(category, status); //Number, Decimal digit
+ //category=Unicode::DECIMAL_DIGIT_NUMBER;
+ //UnicodeSet *set2=new UnicodeSet(category, status); //Number, Decimal digit
+ UnicodeSet *set2=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit
if (U_FAILURE(status)){
- errln((UnicodeString)"FAIL: Can't construct set with cateegory->Nd");
+ errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
return;
}
@@ -407,6 +410,22 @@ void UnicodeSetTest::TestScriptSet() {
expectContainment(set2, "[:Greek:]", CharsToUnicodeString("\\u0391\\u03B1"), "aA");
}
+/**
+ * Test the property set syntaxes: [:Latin:], \p{Greek} and \P{...}.
+ */
+void UnicodeSetTest::TestPropertySet() {
+ UErrorCode status = U_ZERO_ERROR;
+ UnicodeSet set("[:Latin:]", status);
+ if (U_FAILURE(status)) { errln("FAIL"); return; }
+ expectContainment(set, "aA", CharsToUnicodeString("\\u0391\\u03B1")); // Latin letters in; U+0391 and U+03B1 out
+ set.applyPattern("[\\p{Greek}]", status);
+ if (U_FAILURE(status)) { errln("FAIL"); return; }
+ expectContainment(set, CharsToUnicodeString("\\u0391\\u03B1"), "aA"); // mirror image of the check above
+ set.applyPattern("\\P{ GENERAL Category = upper case letter }", status); // long type=value form with extra spaces and mixed case
+ if (U_FAILURE(status)) { errln("FAIL"); return; }
+ expectContainment(set, "abc", "ABC"); // upper-case P negates: lowercase in, uppercase out
+}
+
void UnicodeSetTest::TestExhaustive() {
// exhaustive tests. Simulate UnicodeSets with integers.
// That gives us very solid tests (except for large memory tests).
@@ -569,6 +588,15 @@ UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
return pairs;
}
+void
+UnicodeSetTest::expectContainment(const UnicodeSet& set,
+ const UnicodeString& charsIn,
+ const UnicodeString& charsOut) { // convenience overload: no setName argument
+ UnicodeString pat;
+ set.toPattern(pat); // fills pat from the set itself
+ expectContainment(set, pat, charsIn, charsOut); // delegate to the four-argument overload, passing pat as setName
+}
+
void
UnicodeSetTest::expectContainment(const UnicodeSet& set,
const UnicodeString& setName,
diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h
index 71b1aee927d..bac8c41fa09 100644
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@@ -41,6 +41,11 @@ private:
void TestScriptSet(void);
+ /**
+ * Test the [:Latin:] syntax.
+ */
+ void TestPropertySet(void);
+
void TestExhaustive(void);
private:
@@ -79,6 +84,9 @@ private:
*/
static UnicodeString getPairs(const UnicodeSet& set);
+ void expectContainment(const UnicodeSet& set,
+ const UnicodeString& charsIn,
+ const UnicodeString& charsOut);
void expectContainment(const UnicodeSet& set,
const UnicodeString& setName,
const UnicodeString& charsIn,