diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index cf0799aed14..2025b85ee11 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -111,7 +111,8 @@ util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o p ulist.o uloc_tag.o icudataver.o icuplug.o listformatter.o ulistformatter.o \ sharedobject.o simpleformatter.o unifiedcache.o uloc_keytype.o \ ubiditransform.o \ -pluralmap.o +pluralmap.o \ +numparse_unisets.o ## Header files to install HEADERS = $(srcdir)/unicode/*.h diff --git a/icu4c/source/common/numparse_unisets.cpp b/icu4c/source/common/numparse_unisets.cpp new file mode 100644 index 00000000000..3aa5b5b1e5c --- /dev/null +++ b/icu4c/source/common/numparse_unisets.cpp @@ -0,0 +1,200 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT + +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + +#include "numparse_unisets.h" +#include "umutex.h" +#include "ucln_cmn.h" +#include "unicode/uniset.h" +#include "uresimp.h" +#include "cstring.h" +#include "uassert.h" + +using namespace icu; +using namespace icu::numparse; +using namespace icu::numparse::impl; +using namespace icu::numparse::impl::unisets; + + +namespace { + +static UnicodeSet* gUnicodeSets[COUNT] = {}; + +UnicodeSet* computeUnion(Key k1, Key k2) { + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + return nullptr; + } + result->addAll(*gUnicodeSets[k1]); + result->addAll(*gUnicodeSets[k2]); + result->freeze(); + return result; +} + +UnicodeSet* computeUnion(Key k1, Key k2, Key k3) { + UnicodeSet* result = new UnicodeSet(); + if (result == nullptr) { + return nullptr; + } + result->addAll(*gUnicodeSets[k1]); + result->addAll(*gUnicodeSets[k2]); + result->addAll(*gUnicodeSets[k3]); + result->freeze(); + return result; +} + + +void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) { + // assert unicodeSets.get(key) == null; + gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status); +} + +class ParseDataSink : public ResourceSink { + public: + void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE { + ResourceTable contextsTable = value.getTable(status); + if (U_FAILURE(status)) { return; } + for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { + if (uprv_strcmp(key, "date") == 0) { + // ignore + } else { + ResourceTable strictnessTable = value.getTable(status); + if (U_FAILURE(status)) { return; } + for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { + bool isLenient = (uprv_strcmp(key, "lenient") == 0); + ResourceArray array = value.getArray(status); + if (U_FAILURE(status)) { return; } + for (int k = 0; k < array.getSize(); k++) { + array.getValue(k, value); + UnicodeString str = value.getUnicodeString(status); + if (U_FAILURE(status)) { return; } + // There is both lenient and strict data for comma/period, + // but not for any of the other symbols. + if (str.indexOf(u'.') != -1) { + saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status); + } else if (str.indexOf(u',') != -1) { + saveSet(isLenient ? COMMA : STRICT_COMMA, str, status); + } else if (str.indexOf(u'+') != -1) { + saveSet(PLUS_SIGN, str, status); + } else if (str.indexOf(u'‒') != -1) { + saveSet(MINUS_SIGN, str, status); + } else if (str.indexOf(u'$') != -1) { + saveSet(DOLLAR_SIGN, str, status); + } else if (str.indexOf(u'£') != -1) { + saveSet(POUND_SIGN, str, status); + } else if (str.indexOf(u'₨') != -1) { + saveSet(RUPEE_SIGN, str, status); + } + if (U_FAILURE(status)) { return; } + } + } + } + } + } +}; + + +icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER; + +UBool U_CALLCONV cleanupNumberParseUniSets() { + for (int32_t i = 0; i < COUNT; i++) { + delete gUnicodeSets[i]; + gUnicodeSets[i] = nullptr; + } + return TRUE; +} + +void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { + ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets); + + gUnicodeSets[EMPTY] = new UnicodeSet(); + + // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309. + // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). + gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet( + u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status); + gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status); + + LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status)); + if (U_FAILURE(status)) { return; } + ParseDataSink sink; + ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status); + if (U_FAILURE(status)) { return; } + + // TODO: Should there be fallback behavior if for some reason these sets didn't get populated? + U_ASSERT(gUnicodeSets[COMMA] != nullptr); + U_ASSERT(gUnicodeSets[STRICT_COMMA] != nullptr); + U_ASSERT(gUnicodeSets[PERIOD] != nullptr); + U_ASSERT(gUnicodeSets[STRICT_PERIOD] != nullptr); + + gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet( + u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status); + gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); + gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( + STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); + + U_ASSERT(gUnicodeSets[MINUS_SIGN] != nullptr); + U_ASSERT(gUnicodeSets[PLUS_SIGN] != nullptr); + + gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status); + gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status); + gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status); + + U_ASSERT(gUnicodeSets[DOLLAR_SIGN] != nullptr); + U_ASSERT(gUnicodeSets[POUND_SIGN] != nullptr); + U_ASSERT(gUnicodeSets[RUPEE_SIGN] != nullptr); + gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status); + + gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); + + gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); + gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); + + for (int32_t i = 0; i < COUNT; i++) { + gUnicodeSets[i]->freeze(); + } +} + +} + +const UnicodeSet* unisets::get(Key key) { + UErrorCode localStatus = U_ZERO_ERROR; + umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus); + if (U_FAILURE(localStatus)) { + // TODO: This returns non-null in Java, and callers assume that. + return nullptr; + } + return gUnicodeSets[key]; +} + +Key unisets::chooseFrom(UnicodeString str, Key key1) { + return get(key1)->contains(str) ? key1 : COUNT; +} + +Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { + return get(key1)->contains(str) ? key1 : chooseFrom(str, key2); +} + +//Key unisets::chooseCurrency(UnicodeString str) { +// if (get(DOLLAR_SIGN)->contains(str)) { +// return DOLLAR_SIGN; +// } else if (get(POUND_SIGN)->contains(str)) { +// return POUND_SIGN; +// } else if (get(RUPEE_SIGN)->contains(str)) { +// return RUPEE_SIGN; +// } else if (get(YEN_SIGN)->contains(str)) { +// return YEN_SIGN; +// } else { +// return COUNT; +// } +//} + + +#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_unisets.h b/icu4c/source/common/numparse_unisets.h similarity index 71% rename from icu4c/source/i18n/numparse_unisets.h rename to icu4c/source/common/numparse_unisets.h index 97a44ea860d..7cf3f6aeb15 100644 --- a/icu4c/source/i18n/numparse_unisets.h +++ b/icu4c/source/common/numparse_unisets.h @@ -1,14 +1,16 @@ // © 2018 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html +// This file is in common instead of i18n because it is needed by ucurr.cpp. + #include "unicode/utypes.h" #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT #ifndef __NUMPARSE_UNISETS_H__ #define __NUMPARSE_UNISETS_H__ -#include "numparse_types.h" #include "unicode/uniset.h" +#include "unicode/unistr.h" U_NAMESPACE_BEGIN namespace numparse { namespace impl { @@ -18,8 +20,6 @@ enum Key { EMPTY, // Ignorables - BIDI, - WHITESPACE, DEFAULT_IGNORABLES, STRICT_IGNORABLES, @@ -29,7 +29,7 @@ enum Key { // - PERIOD is a superset of SCRICT_PERIOD // - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS // - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS - COMMA, + COMMA, PERIOD, STRICT_COMMA, STRICT_PERIOD, @@ -38,23 +38,27 @@ enum Key { STRICT_ALL_SEPARATORS, // Symbols - // TODO: NaN? - MINUS_SIGN, + MINUS_SIGN, PLUS_SIGN, PERCENT_SIGN, PERMILLE_SIGN, INFINITY_KEY, // INFINITY is defined in cmath + // Currency Symbols + DOLLAR_SIGN, + POUND_SIGN, + RUPEE_SIGN, + YEN_SIGN, // not in CLDR data, but Currency.java wants it + // Other - DIGITS, - CWCF, + DIGITS, // Combined Separators with Digits (for lead code points) - DIGITS_OR_ALL_SEPARATORS, + DIGITS_OR_ALL_SEPARATORS, DIGITS_OR_STRICT_ALL_SEPARATORS, // The number of elements in the enum. Also used to indicate null. - COUNT + COUNT }; const UnicodeSet* get(Key key); @@ -63,6 +67,19 @@ Key chooseFrom(UnicodeString str, Key key1); Key chooseFrom(UnicodeString str, Key key1, Key key2); +// Unused in C++: +// Key chooseCurrency(UnicodeString str); +// Used instead: +static const struct { + Key key; + UChar32 exemplar; +} kCurrencyEntries[] = { + {DOLLAR_SIGN, u'$'}, + {POUND_SIGN, u'£'}, + {RUPEE_SIGN, u'₨'}, + {YEN_SIGN, u'¥'}, +}; + } // namespace unisets } // namespace impl } // namespace numparse diff --git a/icu4c/source/common/ucln_cmn.h b/icu4c/source/common/ucln_cmn.h index 5db94945172..9b6c2058135 100644 --- a/icu4c/source/common/ucln_cmn.h +++ b/icu4c/source/common/ucln_cmn.h @@ -33,6 +33,7 @@ Please keep the order of enums declared in same order as the cleanup functions are suppose to be called. */ typedef enum ECleanupCommonType { UCLN_COMMON_START = -1, + UCLN_COMMON_NUMPARSE_UNISETS, UCLN_COMMON_USPREP, UCLN_COMMON_BREAKITERATOR, UCLN_COMMON_RBBI, diff --git a/icu4c/source/common/ucurr.cpp b/icu4c/source/common/ucurr.cpp index 6ce53c2d5e5..1fd02ec30b5 100644 --- a/icu4c/source/common/ucurr.cpp +++ b/icu4c/source/common/ucurr.cpp @@ -17,11 +17,13 @@ #include "unicode/ustring.h" #include "unicode/parsepos.h" #include "unicode/uniset.h" +#include "unicode/usetiter.h" #include "unicode/utf16.h" #include "ustr_imp.h" #include "charstr.h" #include "cmemory.h" #include "cstring.h" +#include "numparse_unisets.h" #include "uassert.h" #include "umutex.h" #include "ucln_cmn.h" @@ -67,14 +69,6 @@ static const int32_t POW10[] = { 1, 10, 100, 1000, 10000, 100000, static const int32_t MAX_POW10 = UPRV_LENGTHOF(POW10) - 1; -// Defines equivalent currency symbols. -static const char *EQUIV_CURRENCY_SYMBOLS[][2] = { - {"\\u00a5", "\\uffe5"}, - {"$", "\\ufe69"}, - {"$", "\\uff04"}, - {"\\u20a8", "\\u20b9"}, - {"\\u00a3", "\\u20a4"}}; - #define ISO_CURRENCY_CODE_LENGTH 3 //------------------------------------------------------------ @@ -2207,16 +2201,21 @@ static void U_CALLCONV initIsoCodes(UErrorCode &status) { } static void populateCurrSymbolsEquiv(icu::Hashtable *hash, UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - int32_t length = UPRV_LENGTHOF(EQUIV_CURRENCY_SYMBOLS); - for (int32_t i = 0; i < length; ++i) { - icu::UnicodeString lhs(EQUIV_CURRENCY_SYMBOLS[i][0], -1, US_INV); - icu::UnicodeString rhs(EQUIV_CURRENCY_SYMBOLS[i][1], -1, US_INV); - makeEquivalent(lhs.unescape(), rhs.unescape(), hash, status); - if (U_FAILURE(status)) { - return; + using namespace icu::numparse::impl; + if (U_FAILURE(status)) { return; } + for (auto& entry : unisets::kCurrencyEntries) { + UnicodeString exemplar(entry.exemplar); + const UnicodeSet* set = unisets::get(entry.key); + if (set == nullptr) { return; } + UnicodeSetIterator it(*set); + while (it.next()) { + UnicodeString value = it.getString(); + if (value == exemplar) { + // No need to mark the exemplar character as an equivalent + continue; + } + makeEquivalent(exemplar, value, hash, status); + if (U_FAILURE(status)) { return; } } } } diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index b4fafdf72f8..a66b65a8744 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -92,7 +92,7 @@ csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8. wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \ standardplural.o upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \ tmunit.o tmutamt.o tmutfmt.o currpinf.o \ -uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o decfmtst.o smpdtfst.o \ +uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o smpdtfst.o \ ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o ufieldpositer.o \ decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \ tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o \ @@ -107,7 +107,7 @@ number_mapper.o number_multiplier.o number_currencysymbols.o number_skeletons.o double-conversion.o double-conversion-bignum-dtoa.o double-conversion-bignum.o \ double-conversion-cached-powers.o double-conversion-diy-fp.o \ double-conversion-fast-dtoa.o double-conversion-strtod.o \ -numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o numparse_impl.o \ +numparse_stringsegment.o numparse_parsednumber.o numparse_impl.o \ numparse_symbols.o numparse_decimal.o numparse_scientific.o numparse_currency.o \ numparse_affixes.o numparse_compositions.o numparse_validators.o \ diff --git a/icu4c/source/i18n/decfmtst.cpp b/icu4c/source/i18n/decfmtst.cpp deleted file mode 100644 index e939ab474ad..00000000000 --- a/icu4c/source/i18n/decfmtst.cpp +++ /dev/null @@ -1,251 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 2009-2016, International Business Machines Corporation and -* others. All Rights Reserved. -******************************************************************************* -* -* This file contains the class DecimalFormatStaticSets -* -* DecimalFormatStaticSets holds the UnicodeSets that are needed for lenient -* parsing of decimal and group separators. -******************************************************************************** -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_FORMATTING - -#include "unicode/unistr.h" -#include "unicode/uniset.h" -#include "unicode/uchar.h" -#include "cmemory.h" -#include "cstring.h" -#include "uassert.h" -#include "ucln_in.h" -#include "umutex.h" - -#include "decfmtst.h" - -U_NAMESPACE_BEGIN - - -//------------------------------------------------------------------------------ -// -// Unicode Set pattern strings for all of the required constant sets. -// Initialized with hex values for portability to EBCDIC based machines. -// Really ugly, but there's no good way to avoid it. -// -//------------------------------------------------------------------------------ - -static const UChar gDotEquivalentsPattern[] = { - // [ . \u2024 \u3002 \uFE12 \uFE52 \uFF0E \uFF61 ] - 0x005B, 0x002E, 0x2024, 0x3002, 0xFE12, 0xFE52, 0xFF0E, 0xFF61, 0x005D, 0x0000}; - -static const UChar gCommaEquivalentsPattern[] = { - // [ , \u060C \u066B \u3001 \uFE10 \uFE11 \uFE50 \uFE51 \uFF0C \uFF64 ] - 0x005B, 0x002C, 0x060C, 0x066B, 0x3001, 0xFE10, 0xFE11, 0xFE50, 0xFE51, 0xFF0C, 0xFF64, 0x005D, 0x0000}; - -static const UChar gOtherGroupingSeparatorsPattern[] = { - // [ \ SPACE ' NBSP \u066C \u2000 - \u200A \u2018 \u2019 \u202F \u205F \u3000 \uFF07 ] - 0x005B, 0x005C, 0x0020, 0x0027, 0x00A0, 0x066C, 0x2000, 0x002D, 0x200A, 0x2018, 0x2019, 0x202F, 0x205F, 0x3000, 0xFF07, 0x005D, 0x0000}; - -static const UChar gDashEquivalentsPattern[] = { - // [ \ - HYPHEN F_DASH N_DASH MINUS ] - 0x005B, 0x005C, 0x002D, 0x2010, 0x2012, 0x2013, 0x2212, 0x005D, 0x0000}; - -static const UChar gStrictDotEquivalentsPattern[] = { - // [ . \u2024 \uFE52 \uFF0E \uFF61 ] - 0x005B, 0x002E, 0x2024, 0xFE52, 0xFF0E, 0xFF61, 0x005D, 0x0000}; - -static const UChar gStrictCommaEquivalentsPattern[] = { - // [ , \u066B \uFE10 \uFE50 \uFF0C ] - 0x005B, 0x002C, 0x066B, 0xFE10, 0xFE50, 0xFF0C, 0x005D, 0x0000}; - -static const UChar gStrictOtherGroupingSeparatorsPattern[] = { - // [ \ SPACE ' NBSP \u066C \u2000 - \u200A \u2018 \u2019 \u202F \u205F \u3000 \uFF07 ] - 0x005B, 0x005C, 0x0020, 0x0027, 0x00A0, 0x066C, 0x2000, 0x002D, 0x200A, 0x2018, 0x2019, 0x202F, 0x205F, 0x3000, 0xFF07, 0x005D, 0x0000}; - -static const UChar gStrictDashEquivalentsPattern[] = { - // [ \ - MINUS ] - 0x005B, 0x005C, 0x002D, 0x2212, 0x005D, 0x0000}; - -static const UChar32 gMinusSigns[] = { - 0x002D, - 0x207B, - 0x208B, - 0x2212, - 0x2796, - 0xFE63, - 0xFF0D}; - -static const UChar32 gPlusSigns[] = { - 0x002B, - 0x207A, - 0x208A, - 0x2795, - 0xfB29, - 0xFE62, - 0xFF0B}; - -static void initUnicodeSet(const UChar32 *raw, int32_t len, UnicodeSet *s) { - for (int32_t i = 0; i < len; ++i) { - s->add(raw[i]); - } -} - -DecimalFormatStaticSets::DecimalFormatStaticSets(UErrorCode &status) -: fDotEquivalents(NULL), - fCommaEquivalents(NULL), - fOtherGroupingSeparators(NULL), - fDashEquivalents(NULL), - fStrictDotEquivalents(NULL), - fStrictCommaEquivalents(NULL), - fStrictOtherGroupingSeparators(NULL), - fStrictDashEquivalents(NULL), - fDefaultGroupingSeparators(NULL), - fStrictDefaultGroupingSeparators(NULL), - fMinusSigns(NULL), - fPlusSigns(NULL) -{ - fDotEquivalents = new UnicodeSet(UnicodeString(TRUE, gDotEquivalentsPattern, -1), status); - fCommaEquivalents = new UnicodeSet(UnicodeString(TRUE, gCommaEquivalentsPattern, -1), status); - fOtherGroupingSeparators = new UnicodeSet(UnicodeString(TRUE, gOtherGroupingSeparatorsPattern, -1), status); - fDashEquivalents = new UnicodeSet(UnicodeString(TRUE, gDashEquivalentsPattern, -1), status); - - fStrictDotEquivalents = new UnicodeSet(UnicodeString(TRUE, gStrictDotEquivalentsPattern, -1), status); - fStrictCommaEquivalents = new UnicodeSet(UnicodeString(TRUE, gStrictCommaEquivalentsPattern, -1), status); - fStrictOtherGroupingSeparators = new UnicodeSet(UnicodeString(TRUE, gStrictOtherGroupingSeparatorsPattern, -1), status); - fStrictDashEquivalents = new UnicodeSet(UnicodeString(TRUE, gStrictDashEquivalentsPattern, -1), status); - - - fDefaultGroupingSeparators = new UnicodeSet(*fDotEquivalents); - fDefaultGroupingSeparators->addAll(*fCommaEquivalents); - fDefaultGroupingSeparators->addAll(*fOtherGroupingSeparators); - - fStrictDefaultGroupingSeparators = new UnicodeSet(*fStrictDotEquivalents); - fStrictDefaultGroupingSeparators->addAll(*fStrictCommaEquivalents); - fStrictDefaultGroupingSeparators->addAll(*fStrictOtherGroupingSeparators); - - fMinusSigns = new UnicodeSet(); - fPlusSigns = new UnicodeSet(); - - // Check for null pointers - if (fDotEquivalents == NULL || fCommaEquivalents == NULL || fOtherGroupingSeparators == NULL || fDashEquivalents == NULL || - fStrictDotEquivalents == NULL || fStrictCommaEquivalents == NULL || fStrictOtherGroupingSeparators == NULL || fStrictDashEquivalents == NULL || - fDefaultGroupingSeparators == NULL || fStrictOtherGroupingSeparators == NULL || - fMinusSigns == NULL || fPlusSigns == NULL) { - cleanup(); - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - - initUnicodeSet( - gMinusSigns, - UPRV_LENGTHOF(gMinusSigns), - fMinusSigns); - initUnicodeSet( - gPlusSigns, - UPRV_LENGTHOF(gPlusSigns), - fPlusSigns); - - // Freeze all the sets - fDotEquivalents->freeze(); - fCommaEquivalents->freeze(); - fOtherGroupingSeparators->freeze(); - fDashEquivalents->freeze(); - fStrictDotEquivalents->freeze(); - fStrictCommaEquivalents->freeze(); - fStrictOtherGroupingSeparators->freeze(); - fStrictDashEquivalents->freeze(); - fDefaultGroupingSeparators->freeze(); - fStrictDefaultGroupingSeparators->freeze(); - fMinusSigns->freeze(); - fPlusSigns->freeze(); -} - -DecimalFormatStaticSets::~DecimalFormatStaticSets() { - cleanup(); -} - -void DecimalFormatStaticSets::cleanup() { // Be sure to clean up newly added fields! - delete fDotEquivalents; fDotEquivalents = NULL; - delete fCommaEquivalents; fCommaEquivalents = NULL; - delete fOtherGroupingSeparators; fOtherGroupingSeparators = NULL; - delete fDashEquivalents; fDashEquivalents = NULL; - delete fStrictDotEquivalents; fStrictDotEquivalents = NULL; - delete fStrictCommaEquivalents; fStrictCommaEquivalents = NULL; - delete fStrictOtherGroupingSeparators; fStrictOtherGroupingSeparators = NULL; - delete fStrictDashEquivalents; fStrictDashEquivalents = NULL; - delete fDefaultGroupingSeparators; fDefaultGroupingSeparators = NULL; - delete fStrictDefaultGroupingSeparators; fStrictDefaultGroupingSeparators = NULL; - delete fStrictOtherGroupingSeparators; fStrictOtherGroupingSeparators = NULL; - delete fMinusSigns; fMinusSigns = NULL; - delete fPlusSigns; fPlusSigns = NULL; -} - -static DecimalFormatStaticSets *gStaticSets; -static icu::UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER; - - -//------------------------------------------------------------------------------ -// -// decfmt_cleanup Memory cleanup function, free/delete all -// cached memory. Called by ICU's u_cleanup() function. -// -//------------------------------------------------------------------------------ -U_CDECL_BEGIN -static UBool U_CALLCONV -decimfmt_cleanup(void) -{ - delete gStaticSets; - gStaticSets = NULL; - gStaticSetsInitOnce.reset(); - return TRUE; -} - -static void U_CALLCONV initSets(UErrorCode &status) { - U_ASSERT(gStaticSets == NULL); - ucln_i18n_registerCleanup(UCLN_I18N_DECFMT, decimfmt_cleanup); - gStaticSets = new DecimalFormatStaticSets(status); - if (U_FAILURE(status)) { - delete gStaticSets; - gStaticSets = NULL; - return; - } - if (gStaticSets == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } -} -U_CDECL_END - -const DecimalFormatStaticSets *DecimalFormatStaticSets::getStaticSets(UErrorCode &status) { - umtx_initOnce(gStaticSetsInitOnce, initSets, status); - return gStaticSets; -} - - -const UnicodeSet *DecimalFormatStaticSets::getSimilarDecimals(UChar32 decimal, UBool strictParse) -{ - UErrorCode status = U_ZERO_ERROR; - umtx_initOnce(gStaticSetsInitOnce, initSets, status); - if (U_FAILURE(status)) { - return NULL; - } - - if (gStaticSets->fDotEquivalents->contains(decimal)) { - return strictParse ? gStaticSets->fStrictDotEquivalents : gStaticSets->fDotEquivalents; - } - - if (gStaticSets->fCommaEquivalents->contains(decimal)) { - return strictParse ? gStaticSets->fStrictCommaEquivalents : gStaticSets->fCommaEquivalents; - } - - // if there is no match, return NULL - return NULL; -} - - -U_NAMESPACE_END -#endif // !UCONFIG_NO_FORMATTING diff --git a/icu4c/source/i18n/decfmtst.h b/icu4c/source/i18n/decfmtst.h deleted file mode 100644 index 63ae50c6df9..00000000000 --- a/icu4c/source/i18n/decfmtst.h +++ /dev/null @@ -1,69 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 2009-2016, International Business Machines Corporation and -* others. All Rights Reserved. -******************************************************************************* -* -* This file contains declarations for the class DecimalFormatStaticSets -* -* DecimalFormatStaticSets holds the UnicodeSets that are needed for lenient -* parsing of decimal and group separators. -******************************************************************************** -*/ - -#ifndef DECFMTST_H -#define DECFMTST_H - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_FORMATTING - -#include "unicode/uobject.h" - -U_NAMESPACE_BEGIN - -class UnicodeSet; - - -class DecimalFormatStaticSets : public UMemory -{ -public: - // Constructor and Destructor not for general use. - // Public to permit access from plain C implementation functions. - DecimalFormatStaticSets(UErrorCode &status); - ~DecimalFormatStaticSets(); - - /** - * Return a pointer to a lazy-initialized singleton instance of this class. - */ - static const DecimalFormatStaticSets *getStaticSets(UErrorCode &status); - - static const UnicodeSet *getSimilarDecimals(UChar32 decimal, UBool strictParse); - - UnicodeSet *fDotEquivalents; - UnicodeSet *fCommaEquivalents; - UnicodeSet *fOtherGroupingSeparators; - UnicodeSet *fDashEquivalents; - - UnicodeSet *fStrictDotEquivalents; - UnicodeSet *fStrictCommaEquivalents; - UnicodeSet *fStrictOtherGroupingSeparators; - UnicodeSet *fStrictDashEquivalents; - - UnicodeSet *fDefaultGroupingSeparators; - UnicodeSet *fStrictDefaultGroupingSeparators; - - UnicodeSet *fMinusSigns; - UnicodeSet *fPlusSigns; -private: - void cleanup(); - -}; - - -U_NAMESPACE_END - -#endif // !UCONFIG_NO_FORMATTING -#endif // DECFMTST_H diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp deleted file mode 100644 index eb2f6c1e9d5..00000000000 --- a/icu4c/source/i18n/numparse_unisets.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// © 2018 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT - -// Allow implicit conversion from char16_t* to UnicodeString for this file: -// Helpful in toString methods and elsewhere. -#define UNISTR_FROM_STRING_EXPLICIT - -#include "numparse_unisets.h" -#include "numparse_types.h" -#include "umutex.h" -#include "ucln_in.h" -#include "unicode/uniset.h" - -using namespace icu; -using namespace icu::numparse; -using namespace icu::numparse::impl; -using namespace icu::numparse::impl::unisets; - - -namespace { - -static UnicodeSet* gUnicodeSets[COUNT] = {}; - -UnicodeSet* computeUnion(Key k1, Key k2) { - UnicodeSet* result = new UnicodeSet(); - if (result == nullptr) { - return nullptr; - } - result->addAll(*gUnicodeSets[k1]); - result->addAll(*gUnicodeSets[k2]); - result->freeze(); - return result; -} - -UnicodeSet* computeUnion(Key k1, Key k2, Key k3) { - UnicodeSet* result = new UnicodeSet(); - if (result == nullptr) { - return nullptr; - } - result->addAll(*gUnicodeSets[k1]); - result->addAll(*gUnicodeSets[k2]); - result->addAll(*gUnicodeSets[k3]); - result->freeze(); - return result; -} - -icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER; - -UBool U_CALLCONV cleanupNumberParseUniSets() { - for (int32_t i = 0; i < COUNT; i++) { - delete gUnicodeSets[i]; - gUnicodeSets[i] = nullptr; - } - return TRUE; -} - -void U_CALLCONV initNumberParseUniSets(UErrorCode& status) { - ucln_i18n_registerCleanup(UCLN_I18N_NUMPARSE_UNISETS, cleanupNumberParseUniSets); - - gUnicodeSets[EMPTY] = new UnicodeSet(); - - // These characters are skipped over and ignored at any point in the string, even in strict mode. - // See ticket #13084. - gUnicodeSets[BIDI] = new UnicodeSet(u"[[:DI:]]", status); - - // This set was decided after discussion with icu-design@. See ticket #13309. - // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). - gUnicodeSets[WHITESPACE] = new UnicodeSet(u"[[:Zs:][\\u0009]]", status); - - gUnicodeSets[DEFAULT_IGNORABLES] = computeUnion(BIDI, WHITESPACE); - gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(*gUnicodeSets[BIDI]); - - // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while. - gUnicodeSets[COMMA] = new UnicodeSet(u"[,،٫、︐︑﹐﹑,、]", status); - gUnicodeSets[STRICT_COMMA] = new UnicodeSet(u"[,٫︐﹐,]", status); - gUnicodeSets[PERIOD] = new UnicodeSet(u"[.․。︒﹒.。]", status); - gUnicodeSets[STRICT_PERIOD] = new UnicodeSet(u"[.․﹒.。]", status); - gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet( - u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status); - gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS); - gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion( - STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS); - - gUnicodeSets[MINUS_SIGN] = new UnicodeSet(u"[-⁻₋−➖﹣-]", status); - gUnicodeSets[PLUS_SIGN] = new UnicodeSet(u"[+⁺₊➕﬩﹢+]", status); - - gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status); - gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status); - gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status); - - gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status); - gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status); - - gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS); - gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS); - - for (int32_t i = 0; i < COUNT; i++) { - gUnicodeSets[i]->freeze(); - } -} - -} - -const UnicodeSet* unisets::get(Key key) { - UErrorCode localStatus = U_ZERO_ERROR; - umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, localStatus); - if (U_FAILURE(localStatus)) { - // TODO: This returns non-null in Java, and callers assume that. - return nullptr; - } - return gUnicodeSets[key]; -} - -Key unisets::chooseFrom(UnicodeString str, Key key1) { - return get(key1)->contains(str) ? key1 : COUNT; -} - -Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) { - return get(key1)->contains(str) ? key1 : chooseFrom(str, key2); -} - - -#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/scientificnumberformatter.cpp b/icu4c/source/i18n/scientificnumberformatter.cpp index adf032d989d..a63f15f6fb6 100644 --- a/icu4c/source/i18n/scientificnumberformatter.cpp +++ b/icu4c/source/i18n/scientificnumberformatter.cpp @@ -15,8 +15,8 @@ #include "unicode/fpositer.h" #include "unicode/utf16.h" #include "unicode/uniset.h" -#include "decfmtst.h" #include "unicode/decimfmt.h" +#include "numparse_unisets.h" U_NAMESPACE_BEGIN @@ -129,7 +129,6 @@ UnicodeString &ScientificNumberFormatter::SuperscriptStyle::format( const UnicodeString &original, FieldPositionIterator &fpi, const UnicodeString &preExponent, - const DecimalFormatStaticSets &staticSets, UnicodeString &appendTo, UErrorCode &status) const { if (U_FAILURE(status)) { @@ -149,16 +148,17 @@ UnicodeString &ScientificNumberFormatter::SuperscriptStyle::format( break; case UNUM_EXPONENT_SIGN_FIELD: { + using namespace icu::numparse::impl; int32_t beginIndex = fp.getBeginIndex(); int32_t endIndex = fp.getEndIndex(); UChar32 aChar = original.char32At(beginIndex); - if (staticSets.fMinusSigns->contains(aChar)) { + if (unisets::get(unisets::MINUS_SIGN)->contains(aChar)) { appendTo.append( original, copyFromOffset, beginIndex - copyFromOffset); appendTo.append(kSuperscriptMinusSign); - } else if (staticSets.fPlusSigns->contains(aChar)) { + } else if (unisets::get(unisets::PLUS_SIGN)->contains(aChar)) { appendTo.append( original, copyFromOffset, @@ -203,7 +203,6 @@ UnicodeString &ScientificNumberFormatter::MarkupStyle::format( const UnicodeString &original, FieldPositionIterator &fpi, const UnicodeString &preExponent, - const DecimalFormatStaticSets & /*unusedDecimalFormatSets*/, UnicodeString &appendTo, UErrorCode &status) const { if (U_FAILURE(status)) { @@ -243,8 +242,7 @@ ScientificNumberFormatter::ScientificNumberFormatter( DecimalFormat *fmtToAdopt, Style *styleToAdopt, UErrorCode &status) : fPreExponent(), fDecimalFormat(fmtToAdopt), - fStyle(styleToAdopt), - fStaticSets(NULL) { + fStyle(styleToAdopt) { if (U_FAILURE(status)) { return; } @@ -258,7 +256,6 @@ ScientificNumberFormatter::ScientificNumberFormatter( return; } getPreExponent(*sym, fPreExponent); - fStaticSets = DecimalFormatStaticSets::getStaticSets(status); } ScientificNumberFormatter::ScientificNumberFormatter( @@ -266,8 +263,7 @@ ScientificNumberFormatter::ScientificNumberFormatter( : UObject(other), fPreExponent(other.fPreExponent), fDecimalFormat(NULL), - fStyle(NULL), - fStaticSets(other.fStaticSets) { + fStyle(NULL) { fDecimalFormat = static_cast( other.fDecimalFormat->clone()); fStyle = other.fStyle->clone(); @@ -292,7 +288,6 @@ UnicodeString &ScientificNumberFormatter::format( original, fpi, fPreExponent, - *fStaticSets, appendTo, status); } diff --git a/icu4c/source/i18n/ucln_in.h b/icu4c/source/i18n/ucln_in.h index dc447ca8987..318eafc143c 100644 --- a/icu4c/source/i18n/ucln_in.h +++ b/icu4c/source/i18n/ucln_in.h @@ -27,7 +27,6 @@ It's usually best to have child dependencies called first. */ typedef enum ECleanupI18NType { UCLN_I18N_START = -1, UCLN_I18N_NUMBER_SKELETONS, - UCLN_I18N_NUMPARSE_UNISETS, UCLN_I18N_CURRENCY_SPACING, UCLN_I18N_SPOOF, UCLN_I18N_SPOOFDATA, diff --git a/icu4c/source/i18n/unicode/scientificnumberformatter.h b/icu4c/source/i18n/unicode/scientificnumberformatter.h index 15023d5141a..6c34d2ce29a 100644 --- a/icu4c/source/i18n/unicode/scientificnumberformatter.h +++ b/icu4c/source/i18n/unicode/scientificnumberformatter.h @@ -24,7 +24,6 @@ U_NAMESPACE_BEGIN class FieldPositionIterator; -class DecimalFormatStaticSets; class DecimalFormatSymbols; class DecimalFormat; class Formattable; @@ -150,7 +149,6 @@ public: const UnicodeString &original, FieldPositionIterator &fpi, const UnicodeString &preExponent, - const DecimalFormatStaticSets &decimalFormatSets, UnicodeString &appendTo, UErrorCode &status) const = 0; private: @@ -165,7 +163,6 @@ public: const UnicodeString &original, FieldPositionIterator &fpi, const UnicodeString &preExponent, - const DecimalFormatStaticSets &decimalFormatSets, UnicodeString &appendTo, UErrorCode &status) const; }; @@ -184,7 +181,6 @@ public: const UnicodeString &original, FieldPositionIterator &fpi, const UnicodeString &preExponent, - const DecimalFormatStaticSets &decimalFormatSets, UnicodeString &appendTo, UErrorCode &status) const; private: @@ -211,7 +207,6 @@ public: UnicodeString fPreExponent; DecimalFormat *fDecimalFormat; Style *fStyle; - const DecimalFormatStaticSets *fStaticSets; }; diff --git a/icu4c/source/test/intltest/numfmtst.cpp b/icu4c/source/test/intltest/numfmtst.cpp index 0d07750a4ea..0e259e0d292 100644 --- a/icu4c/source/test/intltest/numfmtst.cpp +++ b/icu4c/source/test/intltest/numfmtst.cpp @@ -1412,7 +1412,7 @@ static const char *lenientAffixTestCases[] = { static const char *lenientMinusTestCases[] = { "-5", "\\u22125", - "\\u20105" + "\\u27965" }; static const char *lenientCurrencyTestCases[] = { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java index cba2dc93849..0148b36347d 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/UnicodeSetStaticCache.java @@ -5,7 +5,13 @@ package com.ibm.icu.impl.number.parse; import java.util.EnumMap; import java.util.Map; +import com.ibm.icu.impl.ICUData; +import com.ibm.icu.impl.ICUResourceBundle; +import com.ibm.icu.impl.UResource; +import com.ibm.icu.impl.UResource.Value; import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.ULocale; +import com.ibm.icu.util.UResourceBundle; /** * This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to @@ -20,8 +26,6 @@ import com.ibm.icu.text.UnicodeSet; public class UnicodeSetStaticCache { public static enum Key { // Ignorables - BIDI, - WHITESPACE, DEFAULT_IGNORABLES, STRICT_IGNORABLES, @@ -47,9 +51,14 @@ public class UnicodeSetStaticCache { PERMILLE_SIGN, INFINITY, + // Currency Symbols + DOLLAR_SIGN, + POUND_SIGN, + RUPEE_SIGN, + YEN_SIGN, // not in CLDR data, but Currency.java wants it + // Other DIGITS, - CWCF, // TODO: Check if this is being used and remove it if not. // Combined Separators with Digits (for lead code points) DIGITS_OR_ALL_SEPARATORS, @@ -70,6 +79,20 @@ public class UnicodeSetStaticCache { return get(key1).contains(str) ? key1 : chooseFrom(str, key2); } + public static Key chooseCurrency(String str) { + if (get(Key.DOLLAR_SIGN).contains(str)) { + return Key.DOLLAR_SIGN; + } else if (get(Key.POUND_SIGN).contains(str)) { + return Key.POUND_SIGN; + } else if (get(Key.RUPEE_SIGN).contains(str)) { + return Key.RUPEE_SIGN; + } else if (get(Key.YEN_SIGN).contains(str)) { + return Key.YEN_SIGN; + } else { + return null; + } + } + private static UnicodeSet computeUnion(Key k1, Key k2) { return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze(); } @@ -78,23 +101,98 @@ public class UnicodeSetStaticCache { return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze(); } + private static void saveSet(Key key, String unicodeSetPattern) { + assert unicodeSets.get(key) == null; + unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze()); + } + + /* + parse{ + date{ + lenient{ + "[\\--/]", + "[\\:∶]", + } + } + general{ + lenient{ + "[.․。︒﹒.。]", + "[\$﹩$$]", + "[£₤]", + "[₨₹{Rp}{Rs}]", + } + } + number{ + lenient{ + "[\\-‒⁻₋−➖﹣-]", + "[,،٫、︐︑﹐﹑,、]", + "[+⁺₊➕﬩﹢+]", + } + stricter{ + "[,٫︐﹐,]", + "[.․﹒.。]", + } + } + } + */ + static class ParseDataSink extends UResource.Sink { + @Override + public void put(com.ibm.icu.impl.UResource.Key key, Value value, boolean noFallback) { + UResource.Table contextsTable = value.getTable(); + for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) { + if (key.contentEquals("date")) { + // ignore + } else { + assert key.contentEquals("general") || key.contentEquals("number"); + UResource.Table strictnessTable = value.getTable(); + for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) { + boolean isLenient = key.contentEquals("lenient"); + UResource.Array array = value.getArray(); + for (int k = 0; k < array.getSize(); k++) { + array.getValue(k, value); + String str = value.toString(); + // There is both lenient and strict data for comma/period, + // but not for any of the other symbols. + if (str.indexOf('.') != -1) { + saveSet(isLenient ? Key.PERIOD : Key.STRICT_PERIOD, str); + } else if (str.indexOf(',') != -1) { + saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str); + } else if (str.indexOf('+') != -1) { + saveSet(Key.PLUS_SIGN, str); + } else if (str.indexOf('‒') != -1) { + saveSet(Key.MINUS_SIGN, str); + } else if (str.indexOf('$') != -1) { + saveSet(Key.DOLLAR_SIGN, str); + } else if (str.indexOf('£') != -1) { + saveSet(Key.POUND_SIGN, str); + } else if (str.indexOf('₨') != -1) { + saveSet(Key.RUPEE_SIGN, str); + } + } + } + } + } + } + } + static { - // These characters are skipped over and ignored at any point in the string, even in strict mode. - // See ticket #13084. - unicodeSets.put(Key.BIDI, new UnicodeSet("[[:DI:]]").freeze()); - - // This set was decided after discussion with icu-design@. See ticket #13309. + // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309. // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property). - unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze()); + unicodeSets.put(Key.DEFAULT_IGNORABLES, + new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze()); + unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze()); - unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE)); - unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI)); + // CLDR provides data for comma, period, minus sign, and plus sign. + ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle + .getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT); + rb.getAllItemsWithFallback("parse", new ParseDataSink()); + + // TODO: Should there be fallback behavior if for some reason these sets didn't get populated? + assert unicodeSets.containsKey(Key.COMMA); + assert unicodeSets.containsKey(Key.STRICT_COMMA); + assert unicodeSets.containsKey(Key.PERIOD); + assert unicodeSets.containsKey(Key.STRICT_PERIOD); - // TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while. - unicodeSets.put(Key.COMMA, new UnicodeSet("[,،٫、︐︑﹐﹑,、]").freeze()); - unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,٫︐﹐,]").freeze()); - unicodeSets.put(Key.PERIOD, new UnicodeSet("[.․。︒﹒.。]").freeze()); - unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.․﹒.。]").freeze()); unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS, new UnicodeSet("['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze()); unicodeSets.put(Key.ALL_SEPARATORS, @@ -102,15 +200,19 @@ public class UnicodeSetStaticCache { unicodeSets.put(Key.STRICT_ALL_SEPARATORS, computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS)); - unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣-]").freeze()); - unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢+]").freeze()); + assert unicodeSets.containsKey(Key.MINUS_SIGN); + assert unicodeSets.containsKey(Key.PLUS_SIGN); unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze()); unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze()); unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze()); + assert unicodeSets.containsKey(Key.DOLLAR_SIGN); + assert unicodeSets.containsKey(Key.POUND_SIGN); + assert unicodeSets.containsKey(Key.RUPEE_SIGN); + unicodeSets.put(Key.YEN_SIGN, new UnicodeSet("[¥\\uffe5]").freeze()); + unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze()); - unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze()); unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS)); unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS, diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java b/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java index a05cebd804d..03febbe1433 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/util/Currency.java @@ -14,7 +14,6 @@ import java.text.ParsePosition; import java.util.ArrayList; import java.util.Collections; import java.util.Date; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -31,10 +30,12 @@ import com.ibm.icu.impl.ICUResourceBundle; import com.ibm.icu.impl.SimpleCache; import com.ibm.icu.impl.SoftCache; import com.ibm.icu.impl.TextTrieMap; +import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache; import com.ibm.icu.text.CurrencyDisplayNames; import com.ibm.icu.text.CurrencyMetaInfo; import com.ibm.icu.text.CurrencyMetaInfo.CurrencyDigits; import com.ibm.icu.text.CurrencyMetaInfo.CurrencyFilter; +import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.util.ULocale.Category; /** @@ -98,13 +99,6 @@ public class Currency extends MeasureUnit { */ public static final int NARROW_SYMBOL_NAME = 3; - private static final EquivalenceRelation EQUIVALENT_CURRENCY_SYMBOLS = - new EquivalenceRelation() - .add("\u00a5", "\uffe5") - .add("$", "\ufe69", "\uff04") - .add("\u20a8", "\u20b9") - .add("\u00a3", "\u20a4"); - /** * Currency Usage used for Decimal Format * @stable ICU 54 @@ -778,8 +772,16 @@ public class Currency extends MeasureUnit { String isoCode = e.getValue(); // Register under not just symbol, but under every equivalent symbol as well // e.g short width yen and long width yen. - for (String equivalentSymbol : EQUIVALENT_CURRENCY_SYMBOLS.get(symbol)) { - symTrie.put(equivalentSymbol, new CurrencyStringInfo(isoCode, symbol)); + UnicodeSetStaticCache.Key key = UnicodeSetStaticCache.chooseCurrency(symbol); + CurrencyStringInfo value = new CurrencyStringInfo(isoCode, symbol); + if (key != null) { + UnicodeSet equivalents = UnicodeSetStaticCache.get(key); + // The symbol itself is included in the UnicodeSet + for (String equivalentSymbol : equivalents) { + symTrie.put(equivalentSymbol, value); + } + } else { + symTrie.put(symbol, value); } } for (Map.Entry e : names.nameMap().entrySet()) { @@ -1039,34 +1041,6 @@ public class Currency extends MeasureUnit { return info.currencies(filter.withTender()); } - private static final class EquivalenceRelation { - - private Map> data = new HashMap>(); - - @SuppressWarnings("unchecked") // See ticket #11395, this is safe. - public EquivalenceRelation add(T... items) { - Set group = new HashSet(); - for (T item : items) { - if (data.containsKey(item)) { - throw new IllegalArgumentException("All groups passed to add must be disjoint."); - } - group.add(item); - } - for (T item : items) { - data.put(item, group); - } - return this; - } - - public Set get(T item) { - Set result = data.get(item); - if (result == null) { - return Collections.singleton(item); - } - return Collections.unmodifiableSet(result); - } - } - private Object writeReplace() throws ObjectStreamException { return new MeasureUnitProxy(type, subType); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java index b442d3b3029..de6c3927779 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java @@ -1764,7 +1764,7 @@ public class NumberFormatTest extends TestFmwk { } // Test default ignorable characters. These should work in both lenient and strict. - UnicodeSet defaultIgnorables = new UnicodeSet("[[:Default_Ignorable_Code_Point:]]").freeze(); + UnicodeSet defaultIgnorables = new UnicodeSet("[[:Bidi_Control:]]").freeze(); fmt.setParseStrict(false); for (String ignorable : defaultIgnorables) { String str = "a b " + ignorable + "1234c ";