From 68744138aaf7a33b632bad000f9fa509bc9cfdba Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Wed, 17 Oct 2001 19:21:12 +0000 Subject: [PATCH] ICU-1272 initial implementation of perl-ish character property syntax for UnicodeSet X-SVN-Rev: 6281 --- icu4c/source/i18n/Makefile.in | 2 +- icu4c/source/i18n/i18n.dsp | 4 + icu4c/source/i18n/quant.cpp | 6 + icu4c/source/i18n/rbt_pars.cpp | 42 +- icu4c/source/i18n/rbt_pars.h | 13 + icu4c/source/i18n/uniset.cpp | 476 +++++++------------ icu4c/source/i18n/upropset.cpp | 597 ++++++++++++++++++++++++ icu4c/source/i18n/upropset.h | 240 ++++++++++ icu4c/source/test/intltest/transtst.cpp | 18 +- icu4c/source/test/intltest/transtst.h | 5 + icu4c/source/test/intltest/usettest.cpp | 42 +- icu4c/source/test/intltest/usettest.h | 8 + 12 files changed, 1118 insertions(+), 335 deletions(-) create mode 100644 icu4c/source/i18n/upropset.cpp create mode 100644 icu4c/source/i18n/upropset.h diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index 4a2923c1fdc..b8dbb49fadb 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -71,7 +71,7 @@ cpdtrans.o hextouni.o rbt.o rbt_data.o rbt_pars.o rbt_rule.o rbt_set.o \ dbbi.o dbbi_tbl.o rbbi.o rbbi_tbl.o nultrans.o \ remtrans.o titletrn.o tolowtrn.o toupptrn.o xformtrn.o \ name2uni.o uni2name.o unitohex.o nortrans.o unifilt.o quant.o transreg.o \ -llong.o nfrs.o nfrule.o nfsubs.o rbnf.o +llong.o nfrs.o nfrule.o nfsubs.o rbnf.o upropset.o diff --git a/icu4c/source/i18n/i18n.dsp b/icu4c/source/i18n/i18n.dsp index bf394056b36..2e6055df047 100644 --- a/icu4c/source/i18n/i18n.dsp +++ b/icu4c/source/i18n/i18n.dsp @@ -370,6 +370,10 @@ SOURCE=.\unum.cpp # End Source File # Begin Source File +SOURCE=.\upropset.cpp +# End Source File +# Begin Source File + SOURCE=.\usearch.cpp # End Source File # Begin Source File diff --git a/icu4c/source/i18n/quant.cpp b/icu4c/source/i18n/quant.cpp index 25074f87514..f90951127a9 100644 --- a/icu4c/source/i18n/quant.cpp +++ 
b/icu4c/source/i18n/quant.cpp @@ -46,9 +46,15 @@ UMatchDegree Quantifier::matches(const Replaceable& text, int32_t start = offset; uint32_t count = 0; while (count < maxCount) { + int32_t pos = offset; UMatchDegree m = matcher->matches(text, offset, limit, incremental); if (m == U_MATCH) { ++count; + if (pos == offset) { + // If offset has not moved we have a zero-width match. + // Don't keep matching it infinitely. + break; + } } else if (incremental && m == U_PARTIAL_MATCH) { return U_PARTIAL_MATCH; } else { diff --git a/icu4c/source/i18n/rbt_pars.cpp b/icu4c/source/i18n/rbt_pars.cpp index fc575304d66..25b0fe9d4fa 100644 --- a/icu4c/source/i18n/rbt_pars.cpp +++ b/icu4c/source/i18n/rbt_pars.cpp @@ -41,8 +41,6 @@ #define SEGMENT_CLOSE ((UChar)0x0029) /*)*/ #define CONTEXT_ANTE ((UChar)0x007B) /*{*/ #define CONTEXT_POST ((UChar)0x007D) /*}*/ -#define SET_OPEN ((UChar)0x005B) /*[*/ -#define SET_CLOSE ((UChar)0x005D) /*]*/ #define CURSOR_POS ((UChar)0x007C) /*|*/ #define CURSOR_OFFSET ((UChar)0x0040) /*@*/ #define ANCHOR_START ((UChar)0x005E) /*^*/ @@ -50,6 +48,13 @@ #define ONE_OR_MORE ((UChar)0x002B) /*+*/ #define ZERO_OR_ONE ((UChar)0x003F) /*?*/ +#define DOT ((UChar)46) /*.*/ + +static const UChar DOT_SET[] = { // "[^[:Zp:][:Zl:]\r\n$]"; + 91, 94, 91, 58, 90, 112, 58, 93, 91, 58, 90, + 108, 58, 93, 92, 114, 92, 110, 36, 93, 0 +}; + // By definition, the ANCHOR_END special character is a // trailing SymbolTable.SYMBOL_REF character. 
// private static final char ANCHOR_END = '$'; @@ -514,6 +519,15 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) { // Text after a presumed end anchor is a syntax err return syntaxError(U_MALFORMED_VARIABLE_REFERENCE, rule, start); } + if (UnicodeSet::resemblesPattern(rule, pos-1)) { + pp.setIndex(pos-1); // Backup to opening '[' + buf.append(parser.parseSet(rule, pp)); + if (U_FAILURE(parser.status)) { + return syntaxError(U_MALFORMED_SET, rule, start); + } + pos = pp.getIndex(); + continue; + } // Handle escapes if (c == ESCAPE) { if (pos == limit) { @@ -653,14 +667,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) { } post = buf.length(); break; - case SET_OPEN: - pp.setIndex(pos-1); // Backup to opening '[' - buf.append(parser.parseSet(rule, pp)); - if (U_FAILURE(parser.status)) { - return syntaxError(U_MALFORMED_SET, rule, start); - } - pos = pp.getIndex(); - break; case CURSOR_POS: if (cursor >= 0) { return syntaxError(U_MULTIPLE_CURSORS, rule, start); @@ -689,6 +695,9 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) { } } break; + case DOT: + buf.append(parser.getDotStandIn()); + break; case KLEENE_STAR: case ONE_OR_MORE: case ZERO_OR_ONE: @@ -749,7 +758,6 @@ int32_t RuleHalf::parse(const UnicodeString& rule, int32_t pos, int32_t limit) { buf.append(parser.generateStandInFor(m)); } break; - // case SET_CLOSE: default: // Disallow unquoted characters other than [0-9A-Za-z] // in the printable ASCII range. These characters are @@ -892,6 +900,7 @@ void TransliteratorParser::parseRules(const UnicodeString& rules, } parseData->data = data; determineVariableRange(rules); + dotStandIn = (UChar) -1; UnicodeString str; // scratch idBlock.truncate(0); @@ -1257,6 +1266,17 @@ UChar TransliteratorParser::generateStandInFor(UnicodeMatcher* adopted) { return variableNext++; } +/** + * Return the stand-in for the dot set. 
It is allocated the first + * time and reused thereafter. + */ +UChar TransliteratorParser::getDotStandIn() { + if (dotStandIn == (UChar) -1) { + dotStandIn = generateStandInFor(new UnicodeSet(DOT_SET, status)); + } + return dotStandIn; +} + /** * Append the value of the given variable name to the given * UnicodeString. diff --git a/icu4c/source/i18n/rbt_pars.h b/icu4c/source/i18n/rbt_pars.h index e650435d30c..ffe81b5dd7e 100644 --- a/icu4c/source/i18n/rbt_pars.h +++ b/icu4c/source/i18n/rbt_pars.h @@ -108,6 +108,13 @@ class TransliteratorParser { */ UnicodeString undefinedVariableName; + /** + * The stand-in character for the 'dot' set, represented by '.' in + * patterns. This is allocated the first time it is needed, and + * reused thereafter. + */ + UChar dotStandIn; + public: /** @@ -190,6 +197,12 @@ private: */ UChar generateStandInFor(UnicodeMatcher* adopted); + /** + * Return the stand-in for the dot set. It is allocated the first + * time and reused thereafter. + */ + UChar getDotStandIn(); + /** * Append the value of the given variable name to the given * UnicodeString. diff --git a/icu4c/source/i18n/uniset.cpp b/icu4c/source/i18n/uniset.cpp index fc4d3f90a51..850ffae200e 100644 --- a/icu4c/source/i18n/uniset.cpp +++ b/icu4c/source/i18n/uniset.cpp @@ -17,6 +17,7 @@ #include "rbt_rule.h" #include "umutex.h" #include "ucln_in.h" +#include "upropset.h" // HIGH_VALUE > all valid values. 
110000 for codepoints #define UNICODESET_HIGH 0x0110000 @@ -42,49 +43,40 @@ #define UPPER_U ((UChar)0x0055) /*U*/ #define LOWER_U ((UChar)0x0075) /*u*/ -// N.B.: This mapping is different in ICU and Java -//const UnicodeString UnicodeSet::CATEGORY_NAMES( -// "CnLuLlLtLmLoMnMeMcNdNlNoZsZlZpCcCfCoCsPdPsPePcPoSmScSkSoPiPf", ""); -static const UChar CATEGORY_NAMES[] = { - 0x43, 0x6E, /* "Cn" */ - 0x4C, 0x75, /* "Lu" */ - 0x4C, 0x6C, /* "Ll" */ - 0x4C, 0x74, /* "Lt" */ - 0x4C, 0x6D, /* "Lm" */ - 0x4C, 0x6F, /* "Lo" */ - 0x4D, 0x6E, /* "Mn" */ - 0x4D, 0x65, /* "Me" */ - 0x4D, 0x63, /* "Mc" */ - 0x4E, 0x64, /* "Nd" */ - 0x4E, 0x6C, /* "Nl" */ - 0x4E, 0x6F, /* "No" */ - 0x5A, 0x73, /* "Zs" */ - 0x5A, 0x6C, /* "Zl" */ - 0x5A, 0x70, /* "Zp" */ - 0x43, 0x63, /* "Cc" */ - 0x43, 0x66, /* "Cf" */ - 0x43, 0x6F, /* "Co" */ - 0x43, 0x73, /* "Cs" */ - 0x50, 0x64, /* "Pd" */ - 0x50, 0x73, /* "Ps" */ - 0x50, 0x65, /* "Pe" */ - 0x50, 0x63, /* "Pc" */ - 0x50, 0x6F, /* "Po" */ - 0x53, 0x6D, /* "Sm" */ - 0x53, 0x63, /* "Sc" */ - 0x53, 0x6B, /* "Sk" */ - 0x53, 0x6F, /* "So" */ - 0x50, 0x69, /* "Pi" */ - 0x50, 0x66, /* "Pf" */ - 0x00 -}; - -/** - * A cache mapping character category integers, as returned by - * Unicode::getType(), to pairs strings. Entries are initially - * zero length and are filled in on demand. - */ -static UnicodeSet* CATEGORY_CACHE = NULL; +//// TEMPORARY: Remove when deprecated category code constructor is removed. 
+//static const UChar CATEGORY_NAMES[] = { +// 0x43, 0x6E, /* "Cn" */ +// 0x4C, 0x75, /* "Lu" */ +// 0x4C, 0x6C, /* "Ll" */ +// 0x4C, 0x74, /* "Lt" */ +// 0x4C, 0x6D, /* "Lm" */ +// 0x4C, 0x6F, /* "Lo" */ +// 0x4D, 0x6E, /* "Mn" */ +// 0x4D, 0x65, /* "Me" */ +// 0x4D, 0x63, /* "Mc" */ +// 0x4E, 0x64, /* "Nd" */ +// 0x4E, 0x6C, /* "Nl" */ +// 0x4E, 0x6F, /* "No" */ +// 0x5A, 0x73, /* "Zs" */ +// 0x5A, 0x6C, /* "Zl" */ +// 0x5A, 0x70, /* "Zp" */ +// 0x43, 0x63, /* "Cc" */ +// 0x43, 0x66, /* "Cf" */ +// 0x43, 0x6F, /* "Co" */ +// 0x43, 0x73, /* "Cs" */ +// 0x50, 0x64, /* "Pd" */ +// 0x50, 0x73, /* "Ps" */ +// 0x50, 0x65, /* "Pe" */ +// 0x50, 0x63, /* "Pc" */ +// 0x50, 0x6F, /* "Po" */ +// 0x53, 0x6D, /* "Sm" */ +// 0x53, 0x63, /* "Sc" */ +// 0x53, 0x6B, /* "Sk" */ +// 0x53, 0x6F, /* "So" */ +// 0x50, 0x69, /* "Pi" */ +// 0x50, 0x66, /* "Pf" */ +// 0x00 +//}; /** * Delimiter string used in patterns to close a category reference: @@ -92,16 +84,12 @@ static UnicodeSet* CATEGORY_CACHE = NULL; */ static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ - /** * Cleanup function for transliterator component; delegates to * Transliterator::cleanupRegistry(). */ U_CFUNC UBool unicodeset_cleanup(void) { - if (CATEGORY_CACHE) { - delete []CATEGORY_CACHE; - CATEGORY_CACHE = NULL; - } + UnicodePropertySet::cleanup(); return TRUE; } @@ -174,24 +162,24 @@ UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, applyPattern(pattern, pos, &symbols, status); } -/** - * Constructs a set from the given Unicode character category. - * @param category an integer indicating the character category as - * returned by Unicode::getType(). 
- */ -UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : - len(0), capacity(START_EXTRA), bufferCapacity(0), list(0), - buffer(0) -{ - if (U_SUCCESS(status)) { - if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) { - status = U_ILLEGAL_ARGUMENT_ERROR; - } else { - list = new UChar32[capacity]; - *this = getCategorySet(category); - } - } -} +///** +// * Constructs a set from the given Unicode character category. +// * @param category an integer indicating the character category as +// * returned by Unicode::getType(). +// */ +//UnicodeSet::UnicodeSet(int8_t category, UErrorCode& status) : +// len(0), capacity(START_EXTRA), bufferCapacity(0), list(0), +// buffer(0) +//{ +// if (U_SUCCESS(status)) { +// if (category < 0 || category >= Unicode::GENERAL_TYPES_COUNT) { +// status = U_ILLEGAL_ARGUMENT_ERROR; +// } else { +// list = new UChar32[capacity]; +// *this = getCategorySet(category); +// } +// } +//} /** * Constructs a set that is identical to the given UnicodeSet. @@ -319,6 +307,16 @@ void UnicodeSet::applyPattern(const UnicodeString& pattern, } } +/** + * Return true if the given position, in the given pattern, appears + * to be the start of a UnicodeSet pattern. + */ +UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { + return ((pos+1) < pattern.length() && + pattern.charAt(pos) == (UChar)91/*[*/) || + UnicodePropertySet::resemblesPattern(pattern, pos); +} + /** * Append the toPattern() representation of a * character to the given StringBuffer. @@ -339,6 +337,8 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape) case COMPLEMENT: case INTERSECTION: case BACKSLASH: + case 123/*{*/: + case 125/*}*/: buf.append(BACKSLASH); break; default: @@ -451,15 +451,15 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, UBool escapeUnprintable) const { result.append(SET_OPEN); - // Check against the predefined categories. 
We implicitly build - // up ALL category sets the first time toPattern() is called. - for (int8_t cat=0; cat 1 && - catName.charAt(0) == COMPLEMENT); - if (invert) { - cat.remove(0, 1); - } - - UBool match = FALSE; - - // if we have two characters, search the category map for that - // code and either construct and return a UnicodeSet from the - // data in the category map or throw an exception - if (cat.length() == 2) { - int32_t i = 0; - int32_t numCategories = Unicode::GENERAL_TYPES_COUNT * 2; - - while (i < numCategories) - { - if (CATEGORY_NAMES[i] == cat.charAt(0) - && CATEGORY_NAMES[i+1] == cat.charAt(1)) - { - *this = getCategorySet((int8_t)(i/2)); - match = TRUE; - break; - } - i += 2; - } - } else if (cat.length() == 1) { - // if we have one character, search the category map for - // codes beginning with that letter, and union together - // all of the matching sets that we find (or throw an - // exception if there are no matches) - clear(); - for (int32_t i=0; i= 0) { - add((UChar32) start, (UChar32) end); - } - start = end = i; - } - } - } - if (start >= 0) { - add((UChar32) start, (UChar32) end); - } - } - } - - if (!match) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - if (invert) { - complement(); - } -} - -/** - * Returns a pairs string for the given category. This string is - * cached and returned again if this method is called again with - * the same parameter. - */ -const UnicodeSet& UnicodeSet::getCategorySet(int8_t cat) { - // In order to tell what cache entries are empty, we assume - // every category specifies at least one character. Thus - // sets in the cache that are empty are uninitialized. 
- if (CATEGORY_CACHE == NULL) { - umtx_lock(NULL); - if (CATEGORY_CACHE == NULL) { - CATEGORY_CACHE = new UnicodeSet[Unicode::GENERAL_TYPES_COUNT]; - ucln_i18n_registerCleanup(); - } - umtx_unlock(NULL); - } - if (CATEGORY_CACHE[cat].isEmpty()) { - // Walk through all Unicode characters, noting the start - // and end of each range for which Character.getType(c) - // returns the given category integer. Since we are - // iterating in order, we can simply append the resulting - // ranges to the pairs string. - UnicodeSet& set = CATEGORY_CACHE[cat]; - int32_t start = -1; - int32_t end = -2; - // N.B.: There seems to be a bug that deadlocks if you - // call getType() with a supplemental character right now. - // TODO: Change 0xFFFF to MAX_VALUE later. - for (int32_t i=MIN_VALUE; i<=0xFFFF/*TEMPORARY*/; ++i) { - if (Unicode::getType((UChar)i) == cat) { - if ((end+1) == i) { - end = i; - } else { - if (start >= 0) { - set.add((UChar32)start, (UChar32)end); - } - start = end = i; - } - } - } - if (start >= 0) { - set.add((UChar32)start, (UChar32)end); - } - } - return CATEGORY_CACHE[cat]; -} - //---------------------------------------------------------------- // Implementation: Utility methods //---------------------------------------------------------------- diff --git a/icu4c/source/i18n/upropset.cpp b/icu4c/source/i18n/upropset.cpp new file mode 100644 index 00000000000..78e6f3f1528 --- /dev/null +++ b/icu4c/source/i18n/upropset.cpp @@ -0,0 +1,597 @@ +/* +********************************************************************** +* Copyright (c) 2001, International Business Machines +* Corporation and others. All Rights Reserved. 
+********************************************************************** +* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.cpp,v $ +* $Date: 2001/10/17 19:20:41 $ +* $Revision: 1.1 $ +********************************************************************** +*/ +#include "upropset.h" +#include "ustrfmt.h" +#include "unicode/unistr.h" +#include "unicode/uscript.h" +#include "unicode/uniset.h" +#include "unicode/parsepos.h" +#include "hash.h" + +U_NAMESPACE_BEGIN + +static Hashtable* NAME_MAP = NULL; + +static Hashtable* CATEGORY_MAP = NULL; + +/** + * A cache mapping character category integers, as returned by + * UCharacter.getType(), to sets. Entries are initially + * null and are created on demand. + */ +static UnicodeSet* CATEGORY_CACHE = NULL; + +/** + * A cache mapping script integers, as defined by + * UScript, to sets. Entries are initially + * null and are created on demand. + */ +static UnicodeSet* SCRIPT_CACHE = NULL; + +// Special value codes +static const int32_t ANY = -1; // general category: all code points + +//---------------------------------------------------------------- +// Unicode string and character constants +//---------------------------------------------------------------- + +static const UChar POSIX_OPEN[] = { 91,58,0 }; // "[:" +static const UChar POSIX_CLOSE[] = { 58,93,0 }; // ":]" + +static const UChar PERL_OPEN[] = { 92,112,0 }; // "\\p" +static const UChar PERL_CLOSE[] = { 125,0 }; // "}" + +static const UChar HAT = 0x005E; /*^*/ +static const UChar UPPER_P = 0x0050; /*P*/ +static const UChar LEFT_BRACE = 0x007B; /*{*/ +static const UChar EQUALS = 0x003D; /*=*/ + +//---------------------------------------------------------------------- +// class _CharString +// An identical class named CharString can be found in transreg.cpp. +// If we find ourselves needing another copy of this utility class we +// should probably pull it out into putil or some such place. 
+//---------------------------------------------------------------------- + +class _CharString { + public: + _CharString(const UnicodeString& str); + ~_CharString(); + operator char*() { return ptr; } + private: + char buf[128]; + char* ptr; +}; + +_CharString::_CharString(const UnicodeString& str) { + if (str.length() >= (int32_t)sizeof(buf)) { + ptr = new char[str.length() + 8]; + } else { + ptr = buf; + } + str.extract(0, 0x7FFFFFFF, ptr, ""); +} + +_CharString::~_CharString() { + if (ptr != buf) { + delete[] ptr; + } +} + +//---------------------------------------------------------------- +// Public API +//---------------------------------------------------------------- + +/** + * Return true if the given position, in the given pattern, appears + * to be the start of a property set pattern [:foo:], \p{foo}, or + * \P{foo}. + */ +UBool UnicodePropertySet::resemblesPattern(const UnicodeString& pattern, + int32_t pos) { + // Patterns are at least 5 characters long + if ((pos+5) > pattern.length()) { + return FALSE; + } + + // Look for an opening [:, [:^, \p, or \P + return (0 == pattern.compare(pos, 2, POSIX_OPEN)) || + (0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT)); +} + +/** + * Create a UnicodeSet by parsing the given pattern at the given + * parse position. + * + * @param pattern the pattern string + * @param ppos on entry, the position at which to begin parsing. + * This shold be one of the locations marked '^': + * + * [:blah:] \p{blah} \P{blah} + * ^ % ^ % ^ % + * + * On return, the position after the last character parsed, that is, + * the locations marked '%'. If the parse fails, ppos is returned + * unchanged. + * @return a newly-constructed UnicodeSet object, or null upon + * failure. 
+ */ +UnicodeSet* UnicodePropertySet::createFromPattern(const UnicodeString& pattern, + ParsePosition& ppos) { + init(); + + UnicodeSet* set = NULL; + + int32_t pos = ppos.getIndex(); + + // On entry, ppos should point to one of the following locations: + + // Minimum length is 5 characters, e.g. \p{L} + if ((pos+5) > pattern.length()) { + return NULL; + } + + UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} + UBool invert = FALSE; + + // Look for an opening [:, [:^, \p, or \P + if (0 == pattern.compare(pos, 2, POSIX_OPEN)) { + posix = TRUE; + pos = skipWhitespace(pattern, pos+2); + if (pos < pattern.length() && pattern.charAt(pos) == HAT) { + ++pos; + invert = TRUE; + } + } else if (0 == pattern.caseCompare(pos, 2, PERL_OPEN, U_FOLD_CASE_DEFAULT)) { + invert = (pattern.charAt(pos+1) == UPPER_P); + pos = skipWhitespace(pattern, pos+2); + if (pos == pattern.length() || pattern.charAt(pos++) != LEFT_BRACE) { + // Syntax error; "\p" or "\P" not followed by "{" + return NULL; + } + } else { + // Open delimiter not seen + return NULL; + } + + // Look for the matching close delimiter, either :] or } + int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); + if (close < 0) { + // Syntax error; close delimiter missing + return NULL; + } + + // Look for an '=' sign. If this is present, we will parse a + // medium \p{gc=Cf} or long \p{GeneralCategory=Format} + // pattern. 
+ int32_t equals = pattern.indexOf(EQUALS, pos); + if (equals >= 0 && equals < close) { + // Equals seen; parse medium/long pattern + UnicodeString typeName = munge(pattern, pos, equals); + UnicodeString valueName = munge(pattern, equals+1, close); + SetFactory factory; + factory = voidPtrToSetFactory(NAME_MAP->get(typeName)); + if (factory == NULL) { + // Syntax error; type name not recognized + return NULL; + } + set = (*factory)(valueName); + } else { + // No equals seen; parse short format \p{Cf} + UnicodeString shortName = munge(pattern, pos, close); + + // First try general category + set = createCategorySet(shortName); + + // If this fails, try script + if (set == NULL) { + set = createScriptSet(shortName); + } + } + + // Upon failure, return NULL with ppos unchanged + if (set == NULL) { + return NULL; + } + + if (invert) { + set->complement(); + } + + // Move to the limit position after the close delimiter + ppos.setIndex(close + (posix ? 2 : 1)); + + return set; +} + +//---------------------------------------------------------------- +// Property set factory static methods +// NOTE: This will change/go away when we implement UCharacter +// based property retrieval. +//---------------------------------------------------------------- + +static UBool _numericValueFilter(UChar32 c, void* context) { + int32_t value = * (int32_t*) context; + // TODO: Change this to a more generic function, like + // u_charNumericValue (when one exists). 
+ return u_charDigitValue(c) == value; +} + +UnicodeSet* UnicodePropertySet::createNumericValueSet(const UnicodeString& valueName) { + _CharString cvalueName(valueName); + UnicodeSet* set = new UnicodeSet(); + char* end; + double value = uprv_strtod(cvalueName, &end); + int32_t ivalue = (int32_t) value; + if (ivalue != value || ivalue < 0 || *end != 0) { + // UCharacter doesn't support negative or non-integral + // values, so just return an empty set + return set; + } + initSetFromFilter(*set, _numericValueFilter, &ivalue); + return set; +} + +/** + * Given a general category value name, create a corresponding + * set and return it, or return null if the name is invalid. + * @param valueName a pre-munged general category value name + */ +UnicodeSet* UnicodePropertySet::createCategorySet(const UnicodeString& valueName) { + int32_t valueCode = CATEGORY_MAP->geti(valueName); + if (valueCode == 0) { + return NULL; + } + + UnicodeSet* set = new UnicodeSet(); + if (valueCode == ANY) { + set->complement(); + return set; + } + for (int32_t cat=0; cataddAll(getCategorySet(cat)); + } + } + return set; +} + +/** + * Given a script value name, create a corresponding set and + * return it, or return null if the name is invalid. 
+ * @param valueName a pre-munged script value name
+ */
+UnicodeSet* UnicodePropertySet::createScriptSet(const UnicodeString& valueName) {
+    _CharString cvalueName(valueName);
+    UErrorCode ec = U_ZERO_ERROR;
+    UScriptCode script = uscript_getCode(cvalueName, &ec);
+    if (script == USCRIPT_INVALID_CODE || U_FAILURE(ec)) {
+        // Syntax error; unknown short name
+        return NULL;
+    }
+    return new UnicodeSet(getScriptSet(script));
+}
+
+//----------------------------------------------------------------
+// Utility methods
+//----------------------------------------------------------------
+
+static UBool _categoryFilter(UChar32 c, void* context) {
+    int32_t value = * (int32_t*) context;
+    return u_charType(c) == value;
+}
+
+/**
+ * Returns a UnicodeSet for the given category. This set is
+ * cached and returned again if this method is called again with
+ * the same parameter.
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+const UnicodeSet& UnicodePropertySet::getCategorySet(int32_t cat) {
+    if (CATEGORY_CACHE[cat].isEmpty()) {
+        initSetFromFilter(CATEGORY_CACHE[cat], _categoryFilter, &cat);
+    }
+    return CATEGORY_CACHE[cat];
+}
+
+static UBool _scriptFilter(UChar32 c, void* context) {
+    UScriptCode value = * (UScriptCode*) context;
+    UErrorCode ec = U_ZERO_ERROR;
+    return uscript_getScript(c, &ec) == value;
+}
+
+/**
+ * Returns a UnicodeSet for the given script. This set is
+ * cached and returned again if this method is called again with
+ * the same parameter.
+ *
+ * Callers MUST NOT MODIFY the returned set.
+ */
+const UnicodeSet& UnicodePropertySet::getScriptSet(UScriptCode script) {
+    if (SCRIPT_CACHE[script].isEmpty()) {
+        initSetFromFilter(SCRIPT_CACHE[script], _scriptFilter, &script);
+    }
+    return SCRIPT_CACHE[script];
+}
+
+/**
+ * Given a string, munge it to lose the whitespace. So "General
+ * Category " becomes "GeneralCategory". We munge all type and value
+ * strings, and store all type and value keys pre-munged.
NOTE: + * Unlike the Java version, we do not modify the case, since we use a + * case-insensitive compare function. + */ +UnicodeString UnicodePropertySet::munge(const UnicodeString& str, + int32_t start, int32_t limit) { + UnicodeString buf; + for (int32_t i=start; i= 0) { + set.add((UChar32)start, (UChar32)end); + } + start = end = i; + } + } + } + if (start >= 0) { + set.add((UChar32)start, (UChar32)end); + } +} + +//---------------------------------------------------------------- +// Type and value name maps +//---------------------------------------------------------------- + +/** + * Add a type mapping to the name map. + */ +void UnicodePropertySet::addType(const UnicodeString& shortName, + const UnicodeString& longName, + SetFactory factory) { + UErrorCode ec = U_ZERO_ERROR; + void* p = setFactoryToVoidPtr(factory); + NAME_MAP->put(shortName, p, ec); + NAME_MAP->put(longName, p, ec); +} + +/** + * Add a value mapping to the name map. + */ +void UnicodePropertySet::addValue(Hashtable* map, + const UnicodeString& shortName, + const UnicodeString& longName, + int32_t value) { + // assert(value != 0); + UErrorCode ec = U_ZERO_ERROR; + map->puti(shortName, value, ec); + if (longName.length() != 0) { + map->puti(longName, value, ec); + } +} + +void UnicodePropertySet::init() { + if (NAME_MAP != NULL) { + return; + } + + NAME_MAP = new Hashtable(TRUE); + CATEGORY_MAP = new Hashtable(TRUE); + CATEGORY_CACHE = new UnicodeSet[U_CHAR_CATEGORY_COUNT]; + SCRIPT_CACHE = new UnicodeSet[USCRIPT_CODE_LIMIT]; + + // NOTE: We munge all search keys to have no whitespace + // and upper case. As such, all stored keys should have + // this format. 
+ + // Load the map with type data + + addType("GC", "GENERALCATEGORY", createCategorySet); + + //addType("CC", "COMBININGCLASS", COMBINING_CLASS); + //addType("BC", "BIDICLASS", BIDI_CLASS); + //addType("DT", "DECOMPOSITIONTYPE", DECOMPOSITION_TYPE); + + addType("NV", "NUMERICVALUE", createNumericValueSet); + + //addType("NT", "NUMERICTYPE", NUMERIC_TYPE); + //addType("EA", "EASTASIANWIDTH", EAST_ASIAN_WIDTH); + //addType("LB", "LINEBREAK", LINE_BREAK); + //addType("JT", "JOININGTYPE", JOINING_TYPE); + + addType("SC", "SCRIPT", createScriptSet); + + // Load the map with value data + + // General Category + + addValue(CATEGORY_MAP, "ANY", "", ANY); // special case + + addValue(CATEGORY_MAP, "C", "OTHER", + (1 << U_CONTROL_CHAR) | + (1 << U_FORMAT_CHAR) | + (1 << U_GENERAL_OTHER_TYPES) | + (1 << U_PRIVATE_USE_CHAR) | + (1 << U_SURROGATE)); + + addValue(CATEGORY_MAP, "CC", "CONTROL", + 1 << U_CONTROL_CHAR); + addValue(CATEGORY_MAP, "CF", "FORMAT", + 1 << U_FORMAT_CHAR); + addValue(CATEGORY_MAP, "CN", "UNASSIGNED", + 1 << U_GENERAL_OTHER_TYPES); + addValue(CATEGORY_MAP, "CO", "PRIVATEUSE", + 1 << U_PRIVATE_USE_CHAR); + addValue(CATEGORY_MAP, "CS", "SURROGATE", + 1 << U_SURROGATE); + + addValue(CATEGORY_MAP, "L", "LETTER", + (1 << U_LOWERCASE_LETTER) | + (1 << U_MODIFIER_LETTER) | + (1 << U_OTHER_LETTER) | + (1 << U_TITLECASE_LETTER) | + (1 << U_UPPERCASE_LETTER)); + + addValue(CATEGORY_MAP, "LL", "LOWERCASELETTER", + 1 << U_LOWERCASE_LETTER); + addValue(CATEGORY_MAP, "LM", "MODIFIERLETTER", + 1 << U_MODIFIER_LETTER); + addValue(CATEGORY_MAP, "LO", "OTHERLETTER", + 1 << U_OTHER_LETTER); + addValue(CATEGORY_MAP, "LT", "TITLECASELETTER", + 1 << U_TITLECASE_LETTER); + addValue(CATEGORY_MAP, "LU", "UPPERCASELETTER", + 1 << U_UPPERCASE_LETTER); + + addValue(CATEGORY_MAP, "M", "MARK", + (1 << U_NON_SPACING_MARK) | + (1 << U_COMBINING_SPACING_MARK) | + (1 << U_ENCLOSING_MARK)); + + addValue(CATEGORY_MAP, "MN", "NONSPACINGMARK", + 1 << U_NON_SPACING_MARK); + 
addValue(CATEGORY_MAP, "MC", "SPACINGMARK", + 1 << U_COMBINING_SPACING_MARK); + addValue(CATEGORY_MAP, "ME", "ENCLOSINGMARK", + 1 << U_ENCLOSING_MARK); + + addValue(CATEGORY_MAP, "N", "NUMBER", + (1 << U_DECIMAL_DIGIT_NUMBER) | + (1 << U_LETTER_NUMBER) | + (1 << U_OTHER_NUMBER)); + + addValue(CATEGORY_MAP, "ND", "DECIMALNUMBER", + 1 << U_DECIMAL_DIGIT_NUMBER); + addValue(CATEGORY_MAP, "NL", "LETTERNUMBER", + 1 << U_LETTER_NUMBER); + addValue(CATEGORY_MAP, "NO", "OTHERNUMBER", + 1 << U_OTHER_NUMBER); + + addValue(CATEGORY_MAP, "P", "PUNCTUATION", + (1 << U_CONNECTOR_PUNCTUATION) | + (1 << U_DASH_PUNCTUATION) | + (1 << U_END_PUNCTUATION) | + (1 << U_FINAL_PUNCTUATION) | + (1 << U_INITIAL_PUNCTUATION) | + (1 << U_OTHER_PUNCTUATION) | + (1 << U_START_PUNCTUATION)); + + addValue(CATEGORY_MAP, "PC", "CONNECTORPUNCTUATION", + 1 << U_CONNECTOR_PUNCTUATION); + addValue(CATEGORY_MAP, "PD", "DASHPUNCTUATION", + 1 << U_DASH_PUNCTUATION); + addValue(CATEGORY_MAP, "PE", "ENDPUNCTUATION", + 1 << U_END_PUNCTUATION); + addValue(CATEGORY_MAP, "PF", "FINALPUNCTUATION", + 1 << U_FINAL_PUNCTUATION); + addValue(CATEGORY_MAP, "PI", "INITIALPUNCTUATION", + 1 << U_INITIAL_PUNCTUATION); + addValue(CATEGORY_MAP, "PO", "OTHERPUNCTUATION", + 1 << U_OTHER_PUNCTUATION); + addValue(CATEGORY_MAP, "PS", "STARTPUNCTUATION", + 1 << U_START_PUNCTUATION); + + addValue(CATEGORY_MAP, "S", "SYMBOL", + (1 << U_CURRENCY_SYMBOL) | + (1 << U_MODIFIER_SYMBOL) | + (1 << U_MATH_SYMBOL) | + (1 << U_OTHER_SYMBOL)); + + addValue(CATEGORY_MAP, "SC", "CURRENCYSYMBOL", + 1 << U_CURRENCY_SYMBOL); + addValue(CATEGORY_MAP, "SK", "MODIFIERSYMBOL", + 1 << U_MODIFIER_SYMBOL); + addValue(CATEGORY_MAP, "SM", "MATHSYMBOL", + 1 << U_MATH_SYMBOL); + addValue(CATEGORY_MAP, "SO", "OTHERSYMBOL", + 1 << U_OTHER_SYMBOL); + + addValue(CATEGORY_MAP, "Z", "SEPARATOR", + (1 << U_LINE_SEPARATOR) | + (1 << U_PARAGRAPH_SEPARATOR) | + (1 << U_SPACE_SEPARATOR)); + + addValue(CATEGORY_MAP, "ZL", "LINESEPARATOR", + 1 << U_LINE_SEPARATOR); + 
addValue(CATEGORY_MAP, "ZP", "PARAGRAPHSEPARATOR", + 1 << U_PARAGRAPH_SEPARATOR); + addValue(CATEGORY_MAP, "ZS", "SPACESEPARATOR", + 1 << U_SPACE_SEPARATOR); +} + +void UnicodePropertySet::cleanup() { + if (NAME_MAP != NULL) { + delete NAME_MAP; NAME_MAP = NULL; + delete CATEGORY_MAP; CATEGORY_MAP = NULL; + delete[] CATEGORY_CACHE; CATEGORY_CACHE = NULL; + delete[] SCRIPT_CACHE; SCRIPT_CACHE = NULL; + } +} + +U_NAMESPACE_END + +//eof diff --git a/icu4c/source/i18n/upropset.h b/icu4c/source/i18n/upropset.h new file mode 100644 index 00000000000..f4bacb5907f --- /dev/null +++ b/icu4c/source/i18n/upropset.h @@ -0,0 +1,240 @@ +/* +********************************************************************** +* Copyright (c) 2001, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +* $Source: /xsrl/Nsvn/icu/icu/source/i18n/Attic/upropset.h,v $ +* $Date: 2001/10/17 19:20:41 $ +* $Revision: 1.1 $ +********************************************************************** +*/ +#ifndef _UPROPSET_H_ +#define _UPROPSET_H_ + +#include "unicode/utypes.h" +#include "unicode/uscript.h" + +U_NAMESPACE_BEGIN + +class UnicodeString; +class UnicodeSet; +class ParsePosition; +class Hashtable; + +/** + * INTERNAL CLASS implementing the UnicodeSet properties as outlined + * at: + * + * http://oss.software.ibm.com/cvs/icu/~checkout~/icuhtml/design/unicodeset_properties.html + * + * Recognized syntax: + * + * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]" + * \p{foo} \P{foo} - white space not allowed within "\p" or "\P" + * + * Other than the above restrictions, white space is ignored. Case + * is ignored except in "\p" and "\P". + * + * This class cannot be instantiated. It has a public static method, + * createPropertySet(), with takes a pattern to be parsed and returns + * a new UnicodeSet. 
Another public static method, + * resemblesPattern(), returns true if a given pattern string appears + * to be a property set pattern, and therefore should be passed in to + * createFromPattern(). + * + * NOTE: Current implementation is incomplete. The following list + * indicates which properties are supported. + * + * + GeneralCategory + * CombiningClass + * BidiClass + * DecompositionType + * + NumericValue + * NumericType + * EastAsianWidth + * LineBreak + * JoiningType + * + Script + * + * '+' indicates a supported property. + * + * @author Alan Liu + * @version $RCSfile: upropset.h,v $ $Revision: 1.1 $ $Date: 2001/10/17 19:20:41 $ + */ +class UnicodePropertySet { + + public: + + //---------------------------------------------------------------- + // Public API + //---------------------------------------------------------------- + + /** + * Return true if the given position, in the given pattern, appears + * to be the start of a property set pattern [:foo:], \p{foo}, or + * \P{foo}. + */ + static UBool resemblesPattern(const UnicodeString& pattern, int32_t pos); + + /** + * Create a UnicodeSet by parsing the given pattern at the given + * parse position. + * + * @param pattern the pattern string + * @param ppos on entry, the position at which to begin parsing. + * This should be one of the locations marked '^': + * + * [:blah:] \p{blah} \P{blah} + * ^ % ^ % ^ % + * + * On return, the position after the last character parsed, that is, + * the locations marked '%'. If the parse fails, ppos is returned + * unchanged. + * @return a newly-constructed UnicodeSet object, or null upon + * failure. + */ + static UnicodeSet* createFromPattern(const UnicodeString& pattern, + ParsePosition& ppos); + + private: + + //---------------------------------------------------------------- + // Property set factory static methods + // NOTE: This will change/go away when we implement UCharacter + // based property retrieval.
+ //---------------------------------------------------------------- + + typedef UnicodeSet* (*SetFactory)(const UnicodeString& valueName); + + static UnicodeSet* createNumericValueSet(const UnicodeString& valueName); + + /** + * Given a general category value name, create a corresponding + * set and return it, or return null if the name is invalid. + * @param valueName a pre-munged general category value name + */ + static UnicodeSet* createCategorySet(const UnicodeString& valueName); + + /** + * Given a script value name, create a corresponding set and + * return it, or return null if the name is invalid. + * @param valueName a pre-munged script value name + */ + static UnicodeSet* createScriptSet(const UnicodeString& valueName); + + //---------------------------------------------------------------- + // Utility methods + //---------------------------------------------------------------- + + /** + * Returns a UnicodeSet for the given category. This set is + * cached and returned again if this method is called again with + * the same parameter. + * + * Callers MUST NOT MODIFY the returned set. + */ + static const UnicodeSet& getCategorySet(int32_t cat); + + /** + * Returns a UnicodeSet for the given script. This set is + * cached and returned again if this method is called again with + * the same parameter. + * + * Callers MUST NOT MODIFY the returned set. + */ + static const UnicodeSet& getScriptSet(UScriptCode script); + + /** + * Given a string, munge it to upper case and lose the whitespace. + * So "General Category " becomes "GENERALCATEGORY". We munge all + * type and value strings, and store all type and value keys + * pre-munged. + */ + static UnicodeString munge(const UnicodeString& str, int32_t start, int32_t limit); + + /** + * Skip over a sequence of zero or more white space characters + * at pos. Return the index of the first non-white-space character + * at or after pos, or str.length(), if there is none. 
+ */ + static int32_t skipWhitespace(const UnicodeString& str, int32_t pos); + + //---------------------------------------------------------------- + // Generic filter-based scanning code + // + // NOTE: In general, we don't want to do this! This is a temporary + // implementation until we have time for something that examines + // the underlying UCharacter data structures in an intelligent + // way. Iterating over all code points is dumb. What we want to + // do, for instance, is iterate over internally-stored ranges + // of characters that have a given property. + //---------------------------------------------------------------- + + /** + * A filter that returns TRUE if the given code point should be + * included in the UnicodeSet being constructed. + */ + typedef UBool (*Filter)(UChar32 codePoint, void* context); + + /** + * Set the given UnicodeSet to contain all code points for which + * filter returns TRUE. The context parameter is passed unchanged + * to the filter function. + */ + static void initSetFromFilter(UnicodeSet& set, Filter filter, + void* context); + + //---------------------------------------------------------------- + // Type and value name maps + //---------------------------------------------------------------- + + /** + * Add a type mapping to the name map. + */ + static void addType(const UnicodeString& shortName, + const UnicodeString& longName, + SetFactory factory); + + /** + * Add a value mapping to the name map. + */ + static void addValue(Hashtable* map, + const UnicodeString& shortName, + const UnicodeString& longName, + int32_t value); + + static void init(); + + public: + static void cleanup(); + + private: + //---------------------------------------------------------------- + // SetFactory <=> void* + // I don't know why the compiler won't cast between these types. + // They should be interconvertible. Does C++ distinguish between + // pointers into code and pointers into data? 
In any case, we + // convert between these types in a safe way here. + //---------------------------------------------------------------- + + union SetFactoryTok { + void* voidPointer; + SetFactory functionPointer; + }; + + inline static void* setFactoryToVoidPtr(SetFactory f) { + SetFactoryTok tok; + tok.functionPointer = f; + return tok.voidPointer; + } + + inline static SetFactory voidPtrToSetFactory(void* p) { + SetFactoryTok tok; + tok.voidPointer = p; + return tok.functionPointer; + } +}; + +U_NAMESPACE_END + +#endif diff --git a/icu4c/source/test/intltest/transtst.cpp b/icu4c/source/test/intltest/transtst.cpp index b2d9334816e..d1a863cbb16 100644 --- a/icu4c/source/test/intltest/transtst.cpp +++ b/icu4c/source/test/intltest/transtst.cpp @@ -124,10 +124,11 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec, TESTCASE(42,TestUndefinedVariable); TESTCASE(43,TestEmptyContext); TESTCASE(44,TestCompoundFilterID); - TESTCASE(45,TestDevanagariLatinRT); - TESTCASE(46,TestTeluguLatinRT); - TESTCASE(47,TestCompoundLatinRT); - TESTCASE(48,TestSanskritLatinRT); + TESTCASE(45,TestPropertySet); + TESTCASE(46,TestDevanagariLatinRT); + TESTCASE(47,TestTeluguLatinRT); + TESTCASE(48,TestCompoundLatinRT); + TESTCASE(49,TestSanskritLatinRT); default: name = ""; break; } } @@ -2066,6 +2067,15 @@ void TransliteratorTest::TestCompoundFilterID(void) { } } +/** + * Test new property set syntax + */ +void TransliteratorTest::TestPropertySet() { + expect("a>A; \\p{Lu}>x; \\p{ANY}>y;", "abcDEF", "Ayyxxx"); + expect("(.+)>'[' $1 ']';", " a stitch \n in time \r saves 9", + "[ a stitch ]\n[ in time ]\r[ saves 9]"); +} + //====================================================================== // Ram's tests //====================================================================== diff --git a/icu4c/source/test/intltest/transtst.h b/icu4c/source/test/intltest/transtst.h index 5f888d6cac8..e22a907cdbf 100644 --- a/icu4c/source/test/intltest/transtst.h +++ 
b/icu4c/source/test/intltest/transtst.h @@ -210,6 +210,11 @@ class TransliteratorTest : public IntlTest { */ void TestCompoundFilterID(void); + /** + * Test new property set syntax + */ + void TestPropertySet(void); + /* Devanagari-Latin rules Test */ void TestDevanagariLatinRT(void); diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 8b450e67121..8970b9b7335 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -41,7 +41,8 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec, CASE(4,TestMinimalRep); CASE(5,TestAPI); CASE(6,TestScriptSet); - CASE(7,TestExhaustive); + CASE(7,TestPropertySet); + CASE(8,TestExhaustive); default: name = ""; break; } } @@ -103,16 +104,18 @@ UnicodeSetTest::TestCategories(void) { void UnicodeSetTest::TestCloneEqualHash(void) { UErrorCode status = U_ZERO_ERROR; - int8_t category=Unicode::LOWERCASE_LETTER; - UnicodeSet *set1=new UnicodeSet(category, status); // :Li: Letter, lowercase + //int8_t category=Unicode::LOWERCASE_LETTER; + //UnicodeSet *set1=new UnicodeSet(category, status); // :Li: Letter, lowercase + UnicodeSet *set1=new UnicodeSet("[:Ll:]", status); // Letter, lowercase if (U_FAILURE(status)){ - errln((UnicodeString)"FAIL: Can't construst set with cateegory->Ll"); + errln((UnicodeString)"FAIL: Can't construst set with category->Ll"); return; } - category=Unicode::DECIMAL_DIGIT_NUMBER; - UnicodeSet *set2=new UnicodeSet(category, status); //Number, Decimal digit + //category=Unicode::DECIMAL_DIGIT_NUMBER; + //UnicodeSet *set2=new UnicodeSet(category, status); //Number, Decimal digit + UnicodeSet *set2=new UnicodeSet("[:Nd:]", status); //Number, Decimal digit if (U_FAILURE(status)){ - errln((UnicodeString)"FAIL: Can't construct set with cateegory->Nd"); + errln((UnicodeString)"FAIL: Can't construct set with category->Nd"); return; } @@ -407,6 +410,22 @@ void UnicodeSetTest::TestScriptSet() { expectContainment(set2, "[:Greek:]", 
CharsToUnicodeString("\\u0391\\u03B1"), "aA"); } +/** + * Test the [:Latin:] syntax. + */ +void UnicodeSetTest::TestPropertySet() { + UErrorCode status = U_ZERO_ERROR; + UnicodeSet set("[:Latin:]", status); + if (U_FAILURE(status)) { errln("FAIL"); return; } + expectContainment(set, "aA", CharsToUnicodeString("\\u0391\\u03B1")); + set.applyPattern("[\\p{Greek}]", status); + if (U_FAILURE(status)) { errln("FAIL"); return; } + expectContainment(set, CharsToUnicodeString("\\u0391\\u03B1"), "aA"); + set.applyPattern("\\P{ GENERAL Category = upper case letter }", status); + if (U_FAILURE(status)) { errln("FAIL"); return; } + expectContainment(set, "abc", "ABC"); +} + void UnicodeSetTest::TestExhaustive() { // exhaustive tests. Simulate UnicodeSets with integers. // That gives us very solid tests (except for large memory tests). @@ -569,6 +588,15 @@ UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) { return pairs; } +void +UnicodeSetTest::expectContainment(const UnicodeSet& set, + const UnicodeString& charsIn, + const UnicodeString& charsOut) { + UnicodeString pat; + set.toPattern(pat); + expectContainment(set, pat, charsIn, charsOut); +} + void UnicodeSetTest::expectContainment(const UnicodeSet& set, const UnicodeString& setName, diff --git a/icu4c/source/test/intltest/usettest.h b/icu4c/source/test/intltest/usettest.h index 71b1aee927d..bac8c41fa09 100644 --- a/icu4c/source/test/intltest/usettest.h +++ b/icu4c/source/test/intltest/usettest.h @@ -41,6 +41,11 @@ private: void TestScriptSet(void); + /** + * Test the [:Latin:] syntax. + */ + void TestPropertySet(void); + void TestExhaustive(void); private: @@ -79,6 +84,9 @@ private: */ static UnicodeString getPairs(const UnicodeSet& set); + void expectContainment(const UnicodeSet& set, + const UnicodeString& charsIn, + const UnicodeString& charsOut); void expectContainment(const UnicodeSet& set, const UnicodeString& setName, const UnicodeString& charsIn,