mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 06:25:30 +00:00
ICU-13697 Adding data-loading logic for parseLenients sets in CLDR. Ties the sets in with number and currency parsing in ICU4C and ICU4J.
X-SVN-Rev: 41223
This commit is contained in:
parent
af0f8e62e4
commit
354afa4e79
16 changed files with 390 additions and 554 deletions
|
@ -111,7 +111,8 @@ util.o util_props.o parsepos.o locbased.o cwchar.o wintz.o dtintrv.o ucnvsel.o p
|
|||
ulist.o uloc_tag.o icudataver.o icuplug.o listformatter.o ulistformatter.o \
|
||||
sharedobject.o simpleformatter.o unifiedcache.o uloc_keytype.o \
|
||||
ubiditransform.o \
|
||||
pluralmap.o
|
||||
pluralmap.o \
|
||||
numparse_unisets.o
|
||||
|
||||
## Header files to install
|
||||
HEADERS = $(srcdir)/unicode/*.h
|
||||
|
|
200
icu4c/source/common/numparse_unisets.cpp
Normal file
200
icu4c/source/common/numparse_unisets.cpp
Normal file
|
@ -0,0 +1,200 @@
|
|||
// © 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
|
||||
|
||||
// Allow implicit conversion from char16_t* to UnicodeString for this file:
|
||||
// Helpful in toString methods and elsewhere.
|
||||
#define UNISTR_FROM_STRING_EXPLICIT
|
||||
|
||||
#include "numparse_unisets.h"
|
||||
#include "umutex.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "uresimp.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
|
||||
using namespace icu;
|
||||
using namespace icu::numparse;
|
||||
using namespace icu::numparse::impl;
|
||||
using namespace icu::numparse::impl::unisets;
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
// Sets indexed by Key. Zero-initialized here; slots are filled by initNumberParseUniSets()
// and deleted again by cleanupNumberParseUniSets().
static UnicodeSet* gUnicodeSets[COUNT] = {};
|
||||
|
||||
/**
 * Creates a new frozen set containing everything in the sets stored under k1 and k2.
 * Both slots of gUnicodeSets are dereferenced, so they must already be populated.
 *
 * @return the newly allocated union, or nullptr if the allocation failed.
 */
UnicodeSet* computeUnion(Key k1, Key k2) {
    UnicodeSet* merged = new UnicodeSet();
    if (merged != nullptr) {
        merged->addAll(*gUnicodeSets[k1]);
        merged->addAll(*gUnicodeSets[k2]);
        merged->freeze();
    }
    return merged;
}
|
||||
|
||||
/**
 * Creates a new frozen set containing everything in the sets stored under k1, k2 and k3.
 * All three slots of gUnicodeSets are dereferenced, so they must already be populated.
 *
 * @return the newly allocated union, or nullptr if the allocation failed.
 */
UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
    UnicodeSet* merged = new UnicodeSet();
    if (merged != nullptr) {
        merged->addAll(*gUnicodeSets[k1]);
        merged->addAll(*gUnicodeSets[k2]);
        merged->addAll(*gUnicodeSets[k3]);
        merged->freeze();
    }
    return merged;
}
|
||||
|
||||
|
||||
/**
 * Parses unicodeSetPattern and stores the resulting set in the gUnicodeSets slot for key.
 *
 * @param key               slot to fill.
 * @param unicodeSetPattern UnicodeSet pattern string to parse.
 * @param status            in/out error code; nothing is done if it already indicates failure.
 *                          Set to U_MEMORY_ALLOCATION_ERROR if the set cannot be allocated.
 */
void saveSet(Key key, const UnicodeString& unicodeSetPattern, UErrorCode& status) {
    if (U_FAILURE(status)) { return; }
    // assert unicodeSets.get(key) == null;
    // If the data ever maps two patterns to the same key, release the earlier set
    // instead of leaking it; the later pattern still wins, as before.
    delete gUnicodeSets[key];
    gUnicodeSets[key] = new UnicodeSet(unicodeSetPattern, status);
    if (gUnicodeSets[key] == nullptr && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
|
||||
|
||||
class ParseDataSink : public ResourceSink {
|
||||
public:
|
||||
void put(const char* key, ResourceValue& value, UBool /*noFallback*/, UErrorCode& status) U_OVERRIDE {
|
||||
ResourceTable contextsTable = value.getTable(status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
|
||||
if (uprv_strcmp(key, "date") == 0) {
|
||||
// ignore
|
||||
} else {
|
||||
ResourceTable strictnessTable = value.getTable(status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
|
||||
bool isLenient = (uprv_strcmp(key, "lenient") == 0);
|
||||
ResourceArray array = value.getArray(status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
for (int k = 0; k < array.getSize(); k++) {
|
||||
array.getValue(k, value);
|
||||
UnicodeString str = value.getUnicodeString(status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
// There is both lenient and strict data for comma/period,
|
||||
// but not for any of the other symbols.
|
||||
if (str.indexOf(u'.') != -1) {
|
||||
saveSet(isLenient ? PERIOD : STRICT_PERIOD, str, status);
|
||||
} else if (str.indexOf(u',') != -1) {
|
||||
saveSet(isLenient ? COMMA : STRICT_COMMA, str, status);
|
||||
} else if (str.indexOf(u'+') != -1) {
|
||||
saveSet(PLUS_SIGN, str, status);
|
||||
} else if (str.indexOf(u'‒') != -1) {
|
||||
saveSet(MINUS_SIGN, str, status);
|
||||
} else if (str.indexOf(u'$') != -1) {
|
||||
saveSet(DOLLAR_SIGN, str, status);
|
||||
} else if (str.indexOf(u'£') != -1) {
|
||||
saveSet(POUND_SIGN, str, status);
|
||||
} else if (str.indexOf(u'₨') != -1) {
|
||||
saveSet(RUPEE_SIGN, str, status);
|
||||
}
|
||||
if (U_FAILURE(status)) { return; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Init-once guard passed to umtx_initOnce() by unisets::get() so that
// initNumberParseUniSets() is run through it rather than called directly.
icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
/**
 * Cleanup callback registered with ucln_common_registerCleanup().
 * Deletes every set in gUnicodeSets and re-arms the init-once guard.
 *
 * @return always TRUE.
 */
UBool U_CALLCONV cleanupNumberParseUniSets() {
    for (int32_t i = 0; i < COUNT; i++) {
        delete gUnicodeSets[i];
        gUnicodeSets[i] = nullptr;
    }
    // Without this reset the guard stays "done" after cleanup, so a later
    // unisets::get() would skip initialization and hand back the nulled slots.
    gNumberParseUniSetsInitOnce.reset();
    return TRUE;
}
|
||||
|
||||
/**
 * One-time initializer for gUnicodeSets, run through umtx_initOnce().
 *
 * Registers the cleanup callback, builds the hard-coded sets, loads the data-driven
 * sets from the "parse" table of the root bundle, and finally derives the unions and
 * freezes everything.
 *
 * @param status in/out error code. On return it indicates failure if the resource data
 *               could not be read, if a set expected from the data is missing
 *               (U_MISSING_RESOURCE_ERROR), or if a set could not be allocated
 *               (U_MEMORY_ALLOCATION_ERROR). Sets created before the failure stay in
 *               gUnicodeSets and are released by the cleanup callback.
 */
void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
    ucln_common_registerCleanup(UCLN_COMMON_NUMPARSE_UNISETS, cleanupNumberParseUniSets);

    gUnicodeSets[EMPTY] = new UnicodeSet();

    // These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
    // Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
    gUnicodeSets[DEFAULT_IGNORABLES] = new UnicodeSet(
            u"[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]", status);
    gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(u"[[:Bidi_Control:]]", status);

    LocalUResourceBundlePointer rb(ures_open(nullptr, "root", &status));
    if (U_FAILURE(status)) { return; }
    ParseDataSink sink;
    ures_getAllItemsWithFallback(rb.getAlias(), "parse", sink, status);
    if (U_FAILURE(status)) { return; }

    // The sink is expected to have filled these slots. U_ASSERT alone vanishes in
    // release builds, and the unions below dereference the slots, so also fail cleanly.
    static const Key kLoadedFromData[] = {
            COMMA, STRICT_COMMA, PERIOD, STRICT_PERIOD,
            MINUS_SIGN, PLUS_SIGN,
            DOLLAR_SIGN, POUND_SIGN, RUPEE_SIGN};
    for (Key required : kLoadedFromData) {
        U_ASSERT(gUnicodeSets[required] != nullptr);
        if (gUnicodeSets[required] == nullptr) {
            status = U_MISSING_RESOURCE_ERROR;
            return;
        }
    }

    // Remaining hard-coded leaf sets.
    gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
            u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
    gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
    gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
    gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
    gUnicodeSets[YEN_SIGN] = new UnicodeSet(u"[¥\\uffe5]", status);
    gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);

    // computeUnion() dereferences its inputs; make sure the leaves it needs exist.
    if (gUnicodeSets[OTHER_GROUPING_SEPARATORS] == nullptr || gUnicodeSets[DIGITS] == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
    gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
            STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
    if (gUnicodeSets[ALL_SEPARATORS] == nullptr || gUnicodeSets[STRICT_ALL_SEPARATORS] == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Combined separators with digits (for lead code points).
    gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
    gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);

    for (int32_t i = 0; i < COUNT; i++) {
        if (gUnicodeSets[i] == nullptr) {
            // A slot that is still empty here means an allocation above failed.
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        gUnicodeSets[i]->freeze();
    }
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Returns the set stored for key, running the one-time initialization first if needed.
 *
 * @return the set from gUnicodeSets, or nullptr if initialization reported a failure.
 */
const UnicodeSet* unisets::get(Key key) {
    UErrorCode initStatus = U_ZERO_ERROR;
    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, initStatus);
    // TODO: This returns non-null in Java, and callers assume that.
    return U_FAILURE(initStatus) ? nullptr : gUnicodeSets[key];
}
|
||||
|
||||
/**
 * Returns key1 if the set stored for key1 contains str, otherwise COUNT.
 * COUNT is also returned when the set is unavailable because get() returned nullptr.
 */
Key unisets::chooseFrom(UnicodeString str, Key key1) {
    const UnicodeSet* candidates = get(key1);
    // get() yields nullptr when initialization failed; treat that as "no match"
    // instead of dereferencing it.
    return (candidates != nullptr && candidates->contains(str)) ? key1 : COUNT;
}
|
||||
|
||||
/**
 * Returns the first of key1, key2 whose set contains str, or COUNT if neither does.
 * A set that is unavailable because get() returned nullptr is treated as not matching.
 */
Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
    const Key inOrder[] = {key1, key2};
    for (Key candidate : inOrder) {
        const UnicodeSet* set = get(candidate);
        // get() yields nullptr when initialization failed; skip rather than dereference.
        if (set != nullptr && set->contains(str)) {
            return candidate;
        }
    }
    return COUNT;
}
|
||||
|
||||
//Key unisets::chooseCurrency(UnicodeString str) {
|
||||
// if (get(DOLLAR_SIGN)->contains(str)) {
|
||||
// return DOLLAR_SIGN;
|
||||
// } else if (get(POUND_SIGN)->contains(str)) {
|
||||
// return POUND_SIGN;
|
||||
// } else if (get(RUPEE_SIGN)->contains(str)) {
|
||||
// return RUPEE_SIGN;
|
||||
// } else if (get(YEN_SIGN)->contains(str)) {
|
||||
// return YEN_SIGN;
|
||||
// } else {
|
||||
// return COUNT;
|
||||
// }
|
||||
//}
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_FORMATTING */
|
|
@ -1,14 +1,16 @@
|
|||
// © 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// This file is in common instead of i18n because it is needed by ucurr.cpp.
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
|
||||
#ifndef __NUMPARSE_UNISETS_H__
|
||||
#define __NUMPARSE_UNISETS_H__
|
||||
|
||||
#include "numparse_types.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
U_NAMESPACE_BEGIN namespace numparse {
|
||||
namespace impl {
|
||||
|
@ -18,8 +20,6 @@ enum Key {
|
|||
EMPTY,
|
||||
|
||||
// Ignorables
|
||||
BIDI,
|
||||
WHITESPACE,
|
||||
DEFAULT_IGNORABLES,
|
||||
STRICT_IGNORABLES,
|
||||
|
||||
|
@ -29,7 +29,7 @@ enum Key {
|
|||
// - PERIOD is a superset of STRICT_PERIOD
|
||||
// - ALL_SEPARATORS is the union of COMMA, PERIOD, and OTHER_GROUPING_SEPARATORS
|
||||
// - STRICT_ALL_SEPARATORS is the union of STRICT_COMMA, STRICT_PERIOD, and OTHER_GRP_SEPARATORS
|
||||
COMMA,
|
||||
COMMA,
|
||||
PERIOD,
|
||||
STRICT_COMMA,
|
||||
STRICT_PERIOD,
|
||||
|
@ -38,23 +38,27 @@ enum Key {
|
|||
STRICT_ALL_SEPARATORS,
|
||||
|
||||
// Symbols
|
||||
// TODO: NaN?
|
||||
MINUS_SIGN,
|
||||
MINUS_SIGN,
|
||||
PLUS_SIGN,
|
||||
PERCENT_SIGN,
|
||||
PERMILLE_SIGN,
|
||||
INFINITY_KEY, // INFINITY is defined in cmath
|
||||
|
||||
// Currency Symbols
|
||||
DOLLAR_SIGN,
|
||||
POUND_SIGN,
|
||||
RUPEE_SIGN,
|
||||
YEN_SIGN, // not in CLDR data, but Currency.java wants it
|
||||
|
||||
// Other
|
||||
DIGITS,
|
||||
CWCF,
|
||||
DIGITS,
|
||||
|
||||
// Combined Separators with Digits (for lead code points)
|
||||
DIGITS_OR_ALL_SEPARATORS,
|
||||
DIGITS_OR_ALL_SEPARATORS,
|
||||
DIGITS_OR_STRICT_ALL_SEPARATORS,
|
||||
|
||||
// The number of elements in the enum. Also used to indicate null.
|
||||
COUNT
|
||||
COUNT
|
||||
};
|
||||
|
||||
const UnicodeSet* get(Key key);
|
||||
|
@ -63,6 +67,19 @@ Key chooseFrom(UnicodeString str, Key key1);
|
|||
|
||||
Key chooseFrom(UnicodeString str, Key key1, Key key2);
|
||||
|
||||
// Unused in C++:
|
||||
// Key chooseCurrency(UnicodeString str);
|
||||
// Used instead:
|
||||
// Table pairing each currency-symbol set key with one exemplar character.
// populateCurrSymbolsEquiv() walks this table and marks every other member of the
// set returned by get(key) as equivalent to the exemplar.
static const struct {
|
||||
Key key;
|
||||
UChar32 exemplar;
|
||||
} kCurrencyEntries[] = {
|
||||
{DOLLAR_SIGN, u'$'},
|
||||
{POUND_SIGN, u'£'},
|
||||
{RUPEE_SIGN, u'₨'},
|
||||
{YEN_SIGN, u'¥'},
|
||||
};
|
||||
|
||||
} // namespace unisets
|
||||
} // namespace impl
|
||||
} // namespace numparse
|
|
@ -33,6 +33,7 @@ Please keep the order of enums declared in same order
|
|||
as the cleanup functions are supposed to be called. */
|
||||
typedef enum ECleanupCommonType {
|
||||
UCLN_COMMON_START = -1,
|
||||
UCLN_COMMON_NUMPARSE_UNISETS,
|
||||
UCLN_COMMON_USPREP,
|
||||
UCLN_COMMON_BREAKITERATOR,
|
||||
UCLN_COMMON_RBBI,
|
||||
|
|
|
@ -17,11 +17,13 @@
|
|||
#include "unicode/ustring.h"
|
||||
#include "unicode/parsepos.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "numparse_unisets.h"
|
||||
#include "uassert.h"
|
||||
#include "umutex.h"
|
||||
#include "ucln_cmn.h"
|
||||
|
@ -67,14 +69,6 @@ static const int32_t POW10[] = { 1, 10, 100, 1000, 10000, 100000,
|
|||
|
||||
static const int32_t MAX_POW10 = UPRV_LENGTHOF(POW10) - 1;
|
||||
|
||||
// Defines equivalent currency symbols.
|
||||
static const char *EQUIV_CURRENCY_SYMBOLS[][2] = {
|
||||
{"\\u00a5", "\\uffe5"},
|
||||
{"$", "\\ufe69"},
|
||||
{"$", "\\uff04"},
|
||||
{"\\u20a8", "\\u20b9"},
|
||||
{"\\u00a3", "\\u20a4"}};
|
||||
|
||||
#define ISO_CURRENCY_CODE_LENGTH 3
|
||||
|
||||
//------------------------------------------------------------
|
||||
|
@ -2207,16 +2201,21 @@ static void U_CALLCONV initIsoCodes(UErrorCode &status) {
|
|||
}
|
||||
|
||||
static void populateCurrSymbolsEquiv(icu::Hashtable *hash, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
int32_t length = UPRV_LENGTHOF(EQUIV_CURRENCY_SYMBOLS);
|
||||
for (int32_t i = 0; i < length; ++i) {
|
||||
icu::UnicodeString lhs(EQUIV_CURRENCY_SYMBOLS[i][0], -1, US_INV);
|
||||
icu::UnicodeString rhs(EQUIV_CURRENCY_SYMBOLS[i][1], -1, US_INV);
|
||||
makeEquivalent(lhs.unescape(), rhs.unescape(), hash, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
using namespace icu::numparse::impl;
|
||||
if (U_FAILURE(status)) { return; }
|
||||
for (auto& entry : unisets::kCurrencyEntries) {
|
||||
UnicodeString exemplar(entry.exemplar);
|
||||
const UnicodeSet* set = unisets::get(entry.key);
|
||||
if (set == nullptr) { return; }
|
||||
UnicodeSetIterator it(*set);
|
||||
while (it.next()) {
|
||||
UnicodeString value = it.getString();
|
||||
if (value == exemplar) {
|
||||
// No need to mark the exemplar character as an equivalent
|
||||
continue;
|
||||
}
|
||||
makeEquivalent(exemplar, value, hash, status);
|
||||
if (U_FAILURE(status)) { return; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -92,7 +92,7 @@ csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.
|
|||
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o zonemeta.o \
|
||||
standardplural.o upluralrules.o plurrule.o plurfmt.o selfmt.o dtitvfmt.o dtitvinf.o udateintervalformat.o \
|
||||
tmunit.o tmutamt.o tmutfmt.o currpinf.o \
|
||||
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o decfmtst.o smpdtfst.o \
|
||||
uspoof.o uspoof_impl.o uspoof_build.o uspoof_conf.o smpdtfst.o \
|
||||
ztrans.o zrule.o vzone.o fphdlimp.o fpositer.o ufieldpositer.o \
|
||||
decNumber.o decContext.o alphaindex.o tznames.o tznames_impl.o tzgnames.o \
|
||||
tzfmt.o compactdecimalformat.o gender.o region.o scriptset.o \
|
||||
|
@ -107,7 +107,7 @@ number_mapper.o number_multiplier.o number_currencysymbols.o number_skeletons.o
|
|||
double-conversion.o double-conversion-bignum-dtoa.o double-conversion-bignum.o \
|
||||
double-conversion-cached-powers.o double-conversion-diy-fp.o \
|
||||
double-conversion-fast-dtoa.o double-conversion-strtod.o \
|
||||
numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o numparse_impl.o \
|
||||
numparse_stringsegment.o numparse_parsednumber.o numparse_impl.o \
|
||||
numparse_symbols.o numparse_decimal.o numparse_scientific.o numparse_currency.o \
|
||||
numparse_affixes.o numparse_compositions.o numparse_validators.o \
|
||||
|
||||
|
|
|
@ -1,251 +0,0 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
* This file contains the class DecimalFormatStaticSets
|
||||
*
|
||||
* DecimalFormatStaticSets holds the UnicodeSets that are needed for lenient
|
||||
* parsing of decimal and group separators.
|
||||
********************************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING
|
||||
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_in.h"
|
||||
#include "umutex.h"
|
||||
|
||||
#include "decfmtst.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// Unicode Set pattern strings for all of the required constant sets.
|
||||
// Initialized with hex values for portability to EBCDIC based machines.
|
||||
// Really ugly, but there's no good way to avoid it.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
static const UChar gDotEquivalentsPattern[] = {
|
||||
// [ . \u2024 \u3002 \uFE12 \uFE52 \uFF0E \uFF61 ]
|
||||
0x005B, 0x002E, 0x2024, 0x3002, 0xFE12, 0xFE52, 0xFF0E, 0xFF61, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gCommaEquivalentsPattern[] = {
|
||||
// [ , \u060C \u066B \u3001 \uFE10 \uFE11 \uFE50 \uFE51 \uFF0C \uFF64 ]
|
||||
0x005B, 0x002C, 0x060C, 0x066B, 0x3001, 0xFE10, 0xFE11, 0xFE50, 0xFE51, 0xFF0C, 0xFF64, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gOtherGroupingSeparatorsPattern[] = {
|
||||
// [ \ SPACE ' NBSP \u066C \u2000 - \u200A \u2018 \u2019 \u202F \u205F \u3000 \uFF07 ]
|
||||
0x005B, 0x005C, 0x0020, 0x0027, 0x00A0, 0x066C, 0x2000, 0x002D, 0x200A, 0x2018, 0x2019, 0x202F, 0x205F, 0x3000, 0xFF07, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gDashEquivalentsPattern[] = {
|
||||
// [ \ - HYPHEN F_DASH N_DASH MINUS ]
|
||||
0x005B, 0x005C, 0x002D, 0x2010, 0x2012, 0x2013, 0x2212, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gStrictDotEquivalentsPattern[] = {
|
||||
// [ . \u2024 \uFE52 \uFF0E \uFF61 ]
|
||||
0x005B, 0x002E, 0x2024, 0xFE52, 0xFF0E, 0xFF61, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gStrictCommaEquivalentsPattern[] = {
|
||||
// [ , \u066B \uFE10 \uFE50 \uFF0C ]
|
||||
0x005B, 0x002C, 0x066B, 0xFE10, 0xFE50, 0xFF0C, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gStrictOtherGroupingSeparatorsPattern[] = {
|
||||
// [ \ SPACE ' NBSP \u066C \u2000 - \u200A \u2018 \u2019 \u202F \u205F \u3000 \uFF07 ]
|
||||
0x005B, 0x005C, 0x0020, 0x0027, 0x00A0, 0x066C, 0x2000, 0x002D, 0x200A, 0x2018, 0x2019, 0x202F, 0x205F, 0x3000, 0xFF07, 0x005D, 0x0000};
|
||||
|
||||
static const UChar gStrictDashEquivalentsPattern[] = {
|
||||
// [ \ - MINUS ]
|
||||
0x005B, 0x005C, 0x002D, 0x2212, 0x005D, 0x0000};
|
||||
|
||||
static const UChar32 gMinusSigns[] = {
|
||||
0x002D,
|
||||
0x207B,
|
||||
0x208B,
|
||||
0x2212,
|
||||
0x2796,
|
||||
0xFE63,
|
||||
0xFF0D};
|
||||
|
||||
static const UChar32 gPlusSigns[] = {
|
||||
0x002B,
|
||||
0x207A,
|
||||
0x208A,
|
||||
0x2795,
|
||||
0xfB29,
|
||||
0xFE62,
|
||||
0xFF0B};
|
||||
|
||||
static void initUnicodeSet(const UChar32 *raw, int32_t len, UnicodeSet *s) {
|
||||
for (int32_t i = 0; i < len; ++i) {
|
||||
s->add(raw[i]);
|
||||
}
|
||||
}
|
||||
|
||||
DecimalFormatStaticSets::DecimalFormatStaticSets(UErrorCode &status)
|
||||
: fDotEquivalents(NULL),
|
||||
fCommaEquivalents(NULL),
|
||||
fOtherGroupingSeparators(NULL),
|
||||
fDashEquivalents(NULL),
|
||||
fStrictDotEquivalents(NULL),
|
||||
fStrictCommaEquivalents(NULL),
|
||||
fStrictOtherGroupingSeparators(NULL),
|
||||
fStrictDashEquivalents(NULL),
|
||||
fDefaultGroupingSeparators(NULL),
|
||||
fStrictDefaultGroupingSeparators(NULL),
|
||||
fMinusSigns(NULL),
|
||||
fPlusSigns(NULL)
|
||||
{
|
||||
fDotEquivalents = new UnicodeSet(UnicodeString(TRUE, gDotEquivalentsPattern, -1), status);
|
||||
fCommaEquivalents = new UnicodeSet(UnicodeString(TRUE, gCommaEquivalentsPattern, -1), status);
|
||||
fOtherGroupingSeparators = new UnicodeSet(UnicodeString(TRUE, gOtherGroupingSeparatorsPattern, -1), status);
|
||||
fDashEquivalents = new UnicodeSet(UnicodeString(TRUE, gDashEquivalentsPattern, -1), status);
|
||||
|
||||
fStrictDotEquivalents = new UnicodeSet(UnicodeString(TRUE, gStrictDotEquivalentsPattern, -1), status);
|
||||
fStrictCommaEquivalents = new UnicodeSet(UnicodeString(TRUE, gStrictCommaEquivalentsPattern, -1), status);
|
||||
fStrictOtherGroupingSeparators = new UnicodeSet(UnicodeString(TRUE, gStrictOtherGroupingSeparatorsPattern, -1), status);
|
||||
fStrictDashEquivalents = new UnicodeSet(UnicodeString(TRUE, gStrictDashEquivalentsPattern, -1), status);
|
||||
|
||||
|
||||
fDefaultGroupingSeparators = new UnicodeSet(*fDotEquivalents);
|
||||
fDefaultGroupingSeparators->addAll(*fCommaEquivalents);
|
||||
fDefaultGroupingSeparators->addAll(*fOtherGroupingSeparators);
|
||||
|
||||
fStrictDefaultGroupingSeparators = new UnicodeSet(*fStrictDotEquivalents);
|
||||
fStrictDefaultGroupingSeparators->addAll(*fStrictCommaEquivalents);
|
||||
fStrictDefaultGroupingSeparators->addAll(*fStrictOtherGroupingSeparators);
|
||||
|
||||
fMinusSigns = new UnicodeSet();
|
||||
fPlusSigns = new UnicodeSet();
|
||||
|
||||
// Check for null pointers
|
||||
if (fDotEquivalents == NULL || fCommaEquivalents == NULL || fOtherGroupingSeparators == NULL || fDashEquivalents == NULL ||
|
||||
fStrictDotEquivalents == NULL || fStrictCommaEquivalents == NULL || fStrictOtherGroupingSeparators == NULL || fStrictDashEquivalents == NULL ||
|
||||
fDefaultGroupingSeparators == NULL || fStrictOtherGroupingSeparators == NULL ||
|
||||
fMinusSigns == NULL || fPlusSigns == NULL) {
|
||||
cleanup();
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
initUnicodeSet(
|
||||
gMinusSigns,
|
||||
UPRV_LENGTHOF(gMinusSigns),
|
||||
fMinusSigns);
|
||||
initUnicodeSet(
|
||||
gPlusSigns,
|
||||
UPRV_LENGTHOF(gPlusSigns),
|
||||
fPlusSigns);
|
||||
|
||||
// Freeze all the sets
|
||||
fDotEquivalents->freeze();
|
||||
fCommaEquivalents->freeze();
|
||||
fOtherGroupingSeparators->freeze();
|
||||
fDashEquivalents->freeze();
|
||||
fStrictDotEquivalents->freeze();
|
||||
fStrictCommaEquivalents->freeze();
|
||||
fStrictOtherGroupingSeparators->freeze();
|
||||
fStrictDashEquivalents->freeze();
|
||||
fDefaultGroupingSeparators->freeze();
|
||||
fStrictDefaultGroupingSeparators->freeze();
|
||||
fMinusSigns->freeze();
|
||||
fPlusSigns->freeze();
|
||||
}
|
||||
|
||||
DecimalFormatStaticSets::~DecimalFormatStaticSets() {
|
||||
cleanup();
|
||||
}
|
||||
|
||||
void DecimalFormatStaticSets::cleanup() { // Be sure to clean up newly added fields!
|
||||
delete fDotEquivalents; fDotEquivalents = NULL;
|
||||
delete fCommaEquivalents; fCommaEquivalents = NULL;
|
||||
delete fOtherGroupingSeparators; fOtherGroupingSeparators = NULL;
|
||||
delete fDashEquivalents; fDashEquivalents = NULL;
|
||||
delete fStrictDotEquivalents; fStrictDotEquivalents = NULL;
|
||||
delete fStrictCommaEquivalents; fStrictCommaEquivalents = NULL;
|
||||
delete fStrictOtherGroupingSeparators; fStrictOtherGroupingSeparators = NULL;
|
||||
delete fStrictDashEquivalents; fStrictDashEquivalents = NULL;
|
||||
delete fDefaultGroupingSeparators; fDefaultGroupingSeparators = NULL;
|
||||
delete fStrictDefaultGroupingSeparators; fStrictDefaultGroupingSeparators = NULL;
|
||||
delete fStrictOtherGroupingSeparators; fStrictOtherGroupingSeparators = NULL;
|
||||
delete fMinusSigns; fMinusSigns = NULL;
|
||||
delete fPlusSigns; fPlusSigns = NULL;
|
||||
}
|
||||
|
||||
static DecimalFormatStaticSets *gStaticSets;
|
||||
static icu::UInitOnce gStaticSetsInitOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
//
|
||||
// decfmt_cleanup Memory cleanup function, free/delete all
|
||||
// cached memory. Called by ICU's u_cleanup() function.
|
||||
//
|
||||
//------------------------------------------------------------------------------
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
decimfmt_cleanup(void)
|
||||
{
|
||||
delete gStaticSets;
|
||||
gStaticSets = NULL;
|
||||
gStaticSetsInitOnce.reset();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void U_CALLCONV initSets(UErrorCode &status) {
|
||||
U_ASSERT(gStaticSets == NULL);
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_DECFMT, decimfmt_cleanup);
|
||||
gStaticSets = new DecimalFormatStaticSets(status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete gStaticSets;
|
||||
gStaticSets = NULL;
|
||||
return;
|
||||
}
|
||||
if (gStaticSets == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
U_CDECL_END
|
||||
|
||||
const DecimalFormatStaticSets *DecimalFormatStaticSets::getStaticSets(UErrorCode &status) {
|
||||
umtx_initOnce(gStaticSetsInitOnce, initSets, status);
|
||||
return gStaticSets;
|
||||
}
|
||||
|
||||
|
||||
const UnicodeSet *DecimalFormatStaticSets::getSimilarDecimals(UChar32 decimal, UBool strictParse)
|
||||
{
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
umtx_initOnce(gStaticSetsInitOnce, initSets, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (gStaticSets->fDotEquivalents->contains(decimal)) {
|
||||
return strictParse ? gStaticSets->fStrictDotEquivalents : gStaticSets->fDotEquivalents;
|
||||
}
|
||||
|
||||
if (gStaticSets->fCommaEquivalents->contains(decimal)) {
|
||||
return strictParse ? gStaticSets->fStrictCommaEquivalents : gStaticSets->fCommaEquivalents;
|
||||
}
|
||||
|
||||
// if there is no match, return NULL
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
#endif // !UCONFIG_NO_FORMATTING
|
|
@ -1,69 +0,0 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2016, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
* This file contains declarations for the class DecimalFormatStaticSets
|
||||
*
|
||||
* DecimalFormatStaticSets holds the UnicodeSets that are needed for lenient
|
||||
* parsing of decimal and group separators.
|
||||
********************************************************************************
|
||||
*/
|
||||
|
||||
#ifndef DECFMTST_H
|
||||
#define DECFMTST_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UnicodeSet;
|
||||
|
||||
|
||||
class DecimalFormatStaticSets : public UMemory
|
||||
{
|
||||
public:
|
||||
// Constructor and Destructor not for general use.
|
||||
// Public to permit access from plain C implementation functions.
|
||||
DecimalFormatStaticSets(UErrorCode &status);
|
||||
~DecimalFormatStaticSets();
|
||||
|
||||
/**
|
||||
* Return a pointer to a lazy-initialized singleton instance of this class.
|
||||
*/
|
||||
static const DecimalFormatStaticSets *getStaticSets(UErrorCode &status);
|
||||
|
||||
static const UnicodeSet *getSimilarDecimals(UChar32 decimal, UBool strictParse);
|
||||
|
||||
UnicodeSet *fDotEquivalents;
|
||||
UnicodeSet *fCommaEquivalents;
|
||||
UnicodeSet *fOtherGroupingSeparators;
|
||||
UnicodeSet *fDashEquivalents;
|
||||
|
||||
UnicodeSet *fStrictDotEquivalents;
|
||||
UnicodeSet *fStrictCommaEquivalents;
|
||||
UnicodeSet *fStrictOtherGroupingSeparators;
|
||||
UnicodeSet *fStrictDashEquivalents;
|
||||
|
||||
UnicodeSet *fDefaultGroupingSeparators;
|
||||
UnicodeSet *fStrictDefaultGroupingSeparators;
|
||||
|
||||
UnicodeSet *fMinusSigns;
|
||||
UnicodeSet *fPlusSigns;
|
||||
private:
|
||||
void cleanup();
|
||||
|
||||
};
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_FORMATTING
|
||||
#endif // DECFMTST_H
|
|
@ -1,127 +0,0 @@
|
|||
// © 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
|
||||
|
||||
// Allow implicit conversion from char16_t* to UnicodeString for this file:
|
||||
// Helpful in toString methods and elsewhere.
|
||||
#define UNISTR_FROM_STRING_EXPLICIT
|
||||
|
||||
#include "numparse_unisets.h"
|
||||
#include "numparse_types.h"
|
||||
#include "umutex.h"
|
||||
#include "ucln_in.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
using namespace icu;
|
||||
using namespace icu::numparse;
|
||||
using namespace icu::numparse::impl;
|
||||
using namespace icu::numparse::impl::unisets;
|
||||
|
||||
|
||||
namespace {
|
||||
|
||||
static UnicodeSet* gUnicodeSets[COUNT] = {};
|
||||
|
||||
UnicodeSet* computeUnion(Key k1, Key k2) {
|
||||
UnicodeSet* result = new UnicodeSet();
|
||||
if (result == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
result->addAll(*gUnicodeSets[k1]);
|
||||
result->addAll(*gUnicodeSets[k2]);
|
||||
result->freeze();
|
||||
return result;
|
||||
}
|
||||
|
||||
UnicodeSet* computeUnion(Key k1, Key k2, Key k3) {
|
||||
UnicodeSet* result = new UnicodeSet();
|
||||
if (result == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
result->addAll(*gUnicodeSets[k1]);
|
||||
result->addAll(*gUnicodeSets[k2]);
|
||||
result->addAll(*gUnicodeSets[k3]);
|
||||
result->freeze();
|
||||
return result;
|
||||
}
|
||||
|
||||
icu::UInitOnce gNumberParseUniSetsInitOnce = U_INITONCE_INITIALIZER;
|
||||
|
||||
UBool U_CALLCONV cleanupNumberParseUniSets() {
|
||||
for (int32_t i = 0; i < COUNT; i++) {
|
||||
delete gUnicodeSets[i];
|
||||
gUnicodeSets[i] = nullptr;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_NUMPARSE_UNISETS, cleanupNumberParseUniSets);
|
||||
|
||||
gUnicodeSets[EMPTY] = new UnicodeSet();
|
||||
|
||||
// These characters are skipped over and ignored at any point in the string, even in strict mode.
|
||||
// See ticket #13084.
|
||||
gUnicodeSets[BIDI] = new UnicodeSet(u"[[:DI:]]", status);
|
||||
|
||||
// This set was decided after discussion with icu-design@. See ticket #13309.
|
||||
// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
|
||||
gUnicodeSets[WHITESPACE] = new UnicodeSet(u"[[:Zs:][\\u0009]]", status);
|
||||
|
||||
gUnicodeSets[DEFAULT_IGNORABLES] = computeUnion(BIDI, WHITESPACE);
|
||||
gUnicodeSets[STRICT_IGNORABLES] = new UnicodeSet(*gUnicodeSets[BIDI]);
|
||||
|
||||
// TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
|
||||
gUnicodeSets[COMMA] = new UnicodeSet(u"[,،٫、︐︑﹐﹑,、]", status);
|
||||
gUnicodeSets[STRICT_COMMA] = new UnicodeSet(u"[,٫︐﹐,]", status);
|
||||
gUnicodeSets[PERIOD] = new UnicodeSet(u"[.․。︒﹒.。]", status);
|
||||
gUnicodeSets[STRICT_PERIOD] = new UnicodeSet(u"[.․﹒.。]", status);
|
||||
gUnicodeSets[OTHER_GROUPING_SEPARATORS] = new UnicodeSet(
|
||||
u"['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]", status);
|
||||
gUnicodeSets[ALL_SEPARATORS] = computeUnion(COMMA, PERIOD, OTHER_GROUPING_SEPARATORS);
|
||||
gUnicodeSets[STRICT_ALL_SEPARATORS] = computeUnion(
|
||||
STRICT_COMMA, STRICT_PERIOD, OTHER_GROUPING_SEPARATORS);
|
||||
|
||||
gUnicodeSets[MINUS_SIGN] = new UnicodeSet(u"[-⁻₋−➖﹣-]", status);
|
||||
gUnicodeSets[PLUS_SIGN] = new UnicodeSet(u"[+⁺₊➕﬩﹢+]", status);
|
||||
|
||||
gUnicodeSets[PERCENT_SIGN] = new UnicodeSet(u"[%٪]", status);
|
||||
gUnicodeSets[PERMILLE_SIGN] = new UnicodeSet(u"[‰؉]", status);
|
||||
gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
|
||||
|
||||
gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
|
||||
gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status);
|
||||
|
||||
gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
|
||||
gUnicodeSets[DIGITS_OR_STRICT_ALL_SEPARATORS] = computeUnion(DIGITS, STRICT_ALL_SEPARATORS);
|
||||
|
||||
for (int32_t i = 0; i < COUNT; i++) {
|
||||
gUnicodeSets[i]->freeze();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
 * Looks up the static UnicodeSet stored under the given key.
 *
 * The sets are built on first use by running initNumberParseUniSets through
 * umtx_initOnce.
 *
 * @param key Index into gUnicodeSets.
 * @return The set stored at that index, or nullptr if initialization
 *         reported a failure status.
 */
const UnicodeSet* unisets::get(Key key) {
    UErrorCode initStatus = U_ZERO_ERROR;
    umtx_initOnce(gNumberParseUniSetsInitOnce, &initNumberParseUniSets, initStatus);
    // TODO: This returns non-null in Java, and callers assume that.
    return U_FAILURE(initStatus) ? nullptr : gUnicodeSets[key];
}
|
||||
|
||||
/**
 * Reports whether the set stored under key1 contains the given string.
 *
 * @param str  The string to test for membership.
 * @param key1 The key of the set to test against.
 * @return key1 if that set contains str; COUNT otherwise.
 */
Key unisets::chooseFrom(UnicodeString str, Key key1) {
    const UnicodeSet* candidates = get(key1);
    // get() returns nullptr when the one-time initialization of the static
    // sets fails; report "no match" rather than dereferencing a null pointer.
    if (candidates == nullptr) {
        return COUNT;
    }
    return candidates->contains(str) ? key1 : COUNT;
}
|
||||
|
||||
/**
 * Picks the first of two keys whose set contains the given string.
 *
 * @param str  The string to test for membership.
 * @param key1 The key that is checked first.
 * @param key2 The key that is checked if key1 does not match.
 * @return key1 if its set contains str; otherwise the result of the
 *         single-key chooseFrom(str, key2).
 */
Key unisets::chooseFrom(UnicodeString str, Key key1, Key key2) {
    const UnicodeSet* firstSet = get(key1);
    // get() returns nullptr when the one-time initialization of the static
    // sets fails; treat a missing set as "no match" instead of dereferencing it.
    if (firstSet != nullptr && firstSet->contains(str)) {
        return key1;
    }
    return chooseFrom(str, key2);
}
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_FORMATTING */
|
|
@ -15,8 +15,8 @@
|
|||
#include "unicode/fpositer.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "decfmtst.h"
|
||||
#include "unicode/decimfmt.h"
|
||||
#include "numparse_unisets.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -129,7 +129,6 @@ UnicodeString &ScientificNumberFormatter::SuperscriptStyle::format(
|
|||
const UnicodeString &original,
|
||||
FieldPositionIterator &fpi,
|
||||
const UnicodeString &preExponent,
|
||||
const DecimalFormatStaticSets &staticSets,
|
||||
UnicodeString &appendTo,
|
||||
UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -149,16 +148,17 @@ UnicodeString &ScientificNumberFormatter::SuperscriptStyle::format(
|
|||
break;
|
||||
case UNUM_EXPONENT_SIGN_FIELD:
|
||||
{
|
||||
using namespace icu::numparse::impl;
|
||||
int32_t beginIndex = fp.getBeginIndex();
|
||||
int32_t endIndex = fp.getEndIndex();
|
||||
UChar32 aChar = original.char32At(beginIndex);
|
||||
if (staticSets.fMinusSigns->contains(aChar)) {
|
||||
if (unisets::get(unisets::MINUS_SIGN)->contains(aChar)) {
|
||||
appendTo.append(
|
||||
original,
|
||||
copyFromOffset,
|
||||
beginIndex - copyFromOffset);
|
||||
appendTo.append(kSuperscriptMinusSign);
|
||||
} else if (staticSets.fPlusSigns->contains(aChar)) {
|
||||
} else if (unisets::get(unisets::PLUS_SIGN)->contains(aChar)) {
|
||||
appendTo.append(
|
||||
original,
|
||||
copyFromOffset,
|
||||
|
@ -203,7 +203,6 @@ UnicodeString &ScientificNumberFormatter::MarkupStyle::format(
|
|||
const UnicodeString &original,
|
||||
FieldPositionIterator &fpi,
|
||||
const UnicodeString &preExponent,
|
||||
const DecimalFormatStaticSets & /*unusedDecimalFormatSets*/,
|
||||
UnicodeString &appendTo,
|
||||
UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) {
|
||||
|
@ -243,8 +242,7 @@ ScientificNumberFormatter::ScientificNumberFormatter(
|
|||
DecimalFormat *fmtToAdopt, Style *styleToAdopt, UErrorCode &status)
|
||||
: fPreExponent(),
|
||||
fDecimalFormat(fmtToAdopt),
|
||||
fStyle(styleToAdopt),
|
||||
fStaticSets(NULL) {
|
||||
fStyle(styleToAdopt) {
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
|
@ -258,7 +256,6 @@ ScientificNumberFormatter::ScientificNumberFormatter(
|
|||
return;
|
||||
}
|
||||
getPreExponent(*sym, fPreExponent);
|
||||
fStaticSets = DecimalFormatStaticSets::getStaticSets(status);
|
||||
}
|
||||
|
||||
ScientificNumberFormatter::ScientificNumberFormatter(
|
||||
|
@ -266,8 +263,7 @@ ScientificNumberFormatter::ScientificNumberFormatter(
|
|||
: UObject(other),
|
||||
fPreExponent(other.fPreExponent),
|
||||
fDecimalFormat(NULL),
|
||||
fStyle(NULL),
|
||||
fStaticSets(other.fStaticSets) {
|
||||
fStyle(NULL) {
|
||||
fDecimalFormat = static_cast<DecimalFormat *>(
|
||||
other.fDecimalFormat->clone());
|
||||
fStyle = other.fStyle->clone();
|
||||
|
@ -292,7 +288,6 @@ UnicodeString &ScientificNumberFormatter::format(
|
|||
original,
|
||||
fpi,
|
||||
fPreExponent,
|
||||
*fStaticSets,
|
||||
appendTo,
|
||||
status);
|
||||
}
|
||||
|
|
|
@ -27,7 +27,6 @@ It's usually best to have child dependencies called first. */
|
|||
typedef enum ECleanupI18NType {
|
||||
UCLN_I18N_START = -1,
|
||||
UCLN_I18N_NUMBER_SKELETONS,
|
||||
UCLN_I18N_NUMPARSE_UNISETS,
|
||||
UCLN_I18N_CURRENCY_SPACING,
|
||||
UCLN_I18N_SPOOF,
|
||||
UCLN_I18N_SPOOFDATA,
|
||||
|
|
|
@ -24,7 +24,6 @@
|
|||
U_NAMESPACE_BEGIN
|
||||
|
||||
class FieldPositionIterator;
|
||||
class DecimalFormatStaticSets;
|
||||
class DecimalFormatSymbols;
|
||||
class DecimalFormat;
|
||||
class Formattable;
|
||||
|
@ -150,7 +149,6 @@ public:
|
|||
const UnicodeString &original,
|
||||
FieldPositionIterator &fpi,
|
||||
const UnicodeString &preExponent,
|
||||
const DecimalFormatStaticSets &decimalFormatSets,
|
||||
UnicodeString &appendTo,
|
||||
UErrorCode &status) const = 0;
|
||||
private:
|
||||
|
@ -165,7 +163,6 @@ public:
|
|||
const UnicodeString &original,
|
||||
FieldPositionIterator &fpi,
|
||||
const UnicodeString &preExponent,
|
||||
const DecimalFormatStaticSets &decimalFormatSets,
|
||||
UnicodeString &appendTo,
|
||||
UErrorCode &status) const;
|
||||
};
|
||||
|
@ -184,7 +181,6 @@ public:
|
|||
const UnicodeString &original,
|
||||
FieldPositionIterator &fpi,
|
||||
const UnicodeString &preExponent,
|
||||
const DecimalFormatStaticSets &decimalFormatSets,
|
||||
UnicodeString &appendTo,
|
||||
UErrorCode &status) const;
|
||||
private:
|
||||
|
@ -211,7 +207,6 @@ public:
|
|||
UnicodeString fPreExponent;
|
||||
DecimalFormat *fDecimalFormat;
|
||||
Style *fStyle;
|
||||
const DecimalFormatStaticSets *fStaticSets;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -1412,7 +1412,7 @@ static const char *lenientAffixTestCases[] = {
|
|||
static const char *lenientMinusTestCases[] = {
|
||||
"-5",
|
||||
"\\u22125",
|
||||
"\\u20105"
|
||||
"\\u27965"
|
||||
};
|
||||
|
||||
static const char *lenientCurrencyTestCases[] = {
|
||||
|
|
|
@ -5,7 +5,13 @@ package com.ibm.icu.impl.number.parse;
|
|||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.ibm.icu.impl.ICUData;
|
||||
import com.ibm.icu.impl.ICUResourceBundle;
|
||||
import com.ibm.icu.impl.UResource;
|
||||
import com.ibm.icu.impl.UResource.Value;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.util.UResourceBundle;
|
||||
|
||||
/**
|
||||
* This class statically initializes UnicodeSets useful for number parsing. Microbenchmarks show this to
|
||||
|
@ -20,8 +26,6 @@ import com.ibm.icu.text.UnicodeSet;
|
|||
public class UnicodeSetStaticCache {
|
||||
public static enum Key {
|
||||
// Ignorables
|
||||
BIDI,
|
||||
WHITESPACE,
|
||||
DEFAULT_IGNORABLES,
|
||||
STRICT_IGNORABLES,
|
||||
|
||||
|
@ -47,9 +51,14 @@ public class UnicodeSetStaticCache {
|
|||
PERMILLE_SIGN,
|
||||
INFINITY,
|
||||
|
||||
// Currency Symbols
|
||||
DOLLAR_SIGN,
|
||||
POUND_SIGN,
|
||||
RUPEE_SIGN,
|
||||
YEN_SIGN, // not in CLDR data, but Currency.java wants it
|
||||
|
||||
// Other
|
||||
DIGITS,
|
||||
CWCF, // TODO: Check if this is being used and remove it if not.
|
||||
|
||||
// Combined Separators with Digits (for lead code points)
|
||||
DIGITS_OR_ALL_SEPARATORS,
|
||||
|
@ -70,6 +79,20 @@ public class UnicodeSetStaticCache {
|
|||
return get(key1).contains(str) ? key1 : chooseFrom(str, key2);
|
||||
}
|
||||
|
||||
public static Key chooseCurrency(String str) {
|
||||
if (get(Key.DOLLAR_SIGN).contains(str)) {
|
||||
return Key.DOLLAR_SIGN;
|
||||
} else if (get(Key.POUND_SIGN).contains(str)) {
|
||||
return Key.POUND_SIGN;
|
||||
} else if (get(Key.RUPEE_SIGN).contains(str)) {
|
||||
return Key.RUPEE_SIGN;
|
||||
} else if (get(Key.YEN_SIGN).contains(str)) {
|
||||
return Key.YEN_SIGN;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
private static UnicodeSet computeUnion(Key k1, Key k2) {
|
||||
return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).freeze();
|
||||
}
|
||||
|
@ -78,23 +101,98 @@ public class UnicodeSetStaticCache {
|
|||
return new UnicodeSet().addAll(get(k1)).addAll(get(k2)).addAll(get(k3)).freeze();
|
||||
}
|
||||
|
||||
private static void saveSet(Key key, String unicodeSetPattern) {
|
||||
assert unicodeSets.get(key) == null;
|
||||
unicodeSets.put(key, new UnicodeSet(unicodeSetPattern).freeze());
|
||||
}
|
||||
|
||||
/*
|
||||
parse{
|
||||
date{
|
||||
lenient{
|
||||
"[\\--/]",
|
||||
"[\\:∶]",
|
||||
}
|
||||
}
|
||||
general{
|
||||
lenient{
|
||||
"[.․。︒﹒.。]",
|
||||
"[\$﹩$$]",
|
||||
"[£₤]",
|
||||
"[₨₹{Rp}{Rs}]",
|
||||
}
|
||||
}
|
||||
number{
|
||||
lenient{
|
||||
"[\\-‒⁻₋−➖﹣-]",
|
||||
"[,،٫、︐︑﹐﹑,、]",
|
||||
"[+⁺₊➕﬩﹢+]",
|
||||
}
|
||||
stricter{
|
||||
"[,٫︐﹐,]",
|
||||
"[.․﹒.。]",
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
static class ParseDataSink extends UResource.Sink {
|
||||
@Override
|
||||
public void put(com.ibm.icu.impl.UResource.Key key, Value value, boolean noFallback) {
|
||||
UResource.Table contextsTable = value.getTable();
|
||||
for (int i = 0; contextsTable.getKeyAndValue(i, key, value); i++) {
|
||||
if (key.contentEquals("date")) {
|
||||
// ignore
|
||||
} else {
|
||||
assert key.contentEquals("general") || key.contentEquals("number");
|
||||
UResource.Table strictnessTable = value.getTable();
|
||||
for (int j = 0; strictnessTable.getKeyAndValue(j, key, value); j++) {
|
||||
boolean isLenient = key.contentEquals("lenient");
|
||||
UResource.Array array = value.getArray();
|
||||
for (int k = 0; k < array.getSize(); k++) {
|
||||
array.getValue(k, value);
|
||||
String str = value.toString();
|
||||
// There is both lenient and strict data for comma/period,
|
||||
// but not for any of the other symbols.
|
||||
if (str.indexOf('.') != -1) {
|
||||
saveSet(isLenient ? Key.PERIOD : Key.STRICT_PERIOD, str);
|
||||
} else if (str.indexOf(',') != -1) {
|
||||
saveSet(isLenient ? Key.COMMA : Key.STRICT_COMMA, str);
|
||||
} else if (str.indexOf('+') != -1) {
|
||||
saveSet(Key.PLUS_SIGN, str);
|
||||
} else if (str.indexOf('‒') != -1) {
|
||||
saveSet(Key.MINUS_SIGN, str);
|
||||
} else if (str.indexOf('$') != -1) {
|
||||
saveSet(Key.DOLLAR_SIGN, str);
|
||||
} else if (str.indexOf('£') != -1) {
|
||||
saveSet(Key.POUND_SIGN, str);
|
||||
} else if (str.indexOf('₨') != -1) {
|
||||
saveSet(Key.RUPEE_SIGN, str);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static {
|
||||
// These characters are skipped over and ignored at any point in the string, even in strict mode.
|
||||
// See ticket #13084.
|
||||
unicodeSets.put(Key.BIDI, new UnicodeSet("[[:DI:]]").freeze());
|
||||
|
||||
// This set was decided after discussion with icu-design@. See ticket #13309.
|
||||
// These sets were decided after discussion with icu-design@. See tickets #13084 and #13309.
|
||||
// Zs+TAB is "horizontal whitespace" according to UTS #18 (blank property).
|
||||
unicodeSets.put(Key.WHITESPACE, new UnicodeSet("[[:Zs:][\\u0009]]").freeze());
|
||||
unicodeSets.put(Key.DEFAULT_IGNORABLES,
|
||||
new UnicodeSet("[[:Zs:][\\u0009][:Bidi_Control:][:Variation_Selector:]]").freeze());
|
||||
unicodeSets.put(Key.STRICT_IGNORABLES, new UnicodeSet("[[:Bidi_Control:]]").freeze());
|
||||
|
||||
unicodeSets.put(Key.DEFAULT_IGNORABLES, computeUnion(Key.BIDI, Key.WHITESPACE));
|
||||
unicodeSets.put(Key.STRICT_IGNORABLES, get(Key.BIDI));
|
||||
// CLDR provides data for comma, period, minus sign, and plus sign.
|
||||
ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle
|
||||
.getBundleInstance(ICUData.ICU_BASE_NAME, ULocale.ROOT);
|
||||
rb.getAllItemsWithFallback("parse", new ParseDataSink());
|
||||
|
||||
// TODO: Should there be fallback behavior if for some reason these sets didn't get populated?
|
||||
assert unicodeSets.containsKey(Key.COMMA);
|
||||
assert unicodeSets.containsKey(Key.STRICT_COMMA);
|
||||
assert unicodeSets.containsKey(Key.PERIOD);
|
||||
assert unicodeSets.containsKey(Key.STRICT_PERIOD);
|
||||
|
||||
// TODO: Re-generate these sets from the UCD. They probably haven't been updated in a while.
|
||||
unicodeSets.put(Key.COMMA, new UnicodeSet("[,،٫、︐︑﹐﹑,、]").freeze());
|
||||
unicodeSets.put(Key.STRICT_COMMA, new UnicodeSet("[,٫︐﹐,]").freeze());
|
||||
unicodeSets.put(Key.PERIOD, new UnicodeSet("[.․。︒﹒.。]").freeze());
|
||||
unicodeSets.put(Key.STRICT_PERIOD, new UnicodeSet("[.․﹒.。]").freeze());
|
||||
unicodeSets.put(Key.OTHER_GROUPING_SEPARATORS,
|
||||
new UnicodeSet("['٬‘’'\\u0020\\u00A0\\u2000-\\u200A\\u202F\\u205F\\u3000]").freeze());
|
||||
unicodeSets.put(Key.ALL_SEPARATORS,
|
||||
|
@ -102,15 +200,19 @@ public class UnicodeSetStaticCache {
|
|||
unicodeSets.put(Key.STRICT_ALL_SEPARATORS,
|
||||
computeUnion(Key.STRICT_COMMA, Key.STRICT_PERIOD, Key.OTHER_GROUPING_SEPARATORS));
|
||||
|
||||
unicodeSets.put(Key.MINUS_SIGN, new UnicodeSet("[-⁻₋−➖﹣-]").freeze());
|
||||
unicodeSets.put(Key.PLUS_SIGN, new UnicodeSet("[+⁺₊➕﬩﹢+]").freeze());
|
||||
assert unicodeSets.containsKey(Key.MINUS_SIGN);
|
||||
assert unicodeSets.containsKey(Key.PLUS_SIGN);
|
||||
|
||||
unicodeSets.put(Key.PERCENT_SIGN, new UnicodeSet("[%٪]").freeze());
|
||||
unicodeSets.put(Key.PERMILLE_SIGN, new UnicodeSet("[‰؉]").freeze());
|
||||
unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
|
||||
|
||||
assert unicodeSets.containsKey(Key.DOLLAR_SIGN);
|
||||
assert unicodeSets.containsKey(Key.POUND_SIGN);
|
||||
assert unicodeSets.containsKey(Key.RUPEE_SIGN);
|
||||
unicodeSets.put(Key.YEN_SIGN, new UnicodeSet("[¥\\uffe5]").freeze());
|
||||
|
||||
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
|
||||
unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze());
|
||||
|
||||
unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
|
||||
unicodeSets.put(Key.DIGITS_OR_STRICT_ALL_SEPARATORS,
|
||||
|
|
|
@ -14,7 +14,6 @@ import java.text.ParsePosition;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -31,10 +30,12 @@ import com.ibm.icu.impl.ICUResourceBundle;
|
|||
import com.ibm.icu.impl.SimpleCache;
|
||||
import com.ibm.icu.impl.SoftCache;
|
||||
import com.ibm.icu.impl.TextTrieMap;
|
||||
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache;
|
||||
import com.ibm.icu.text.CurrencyDisplayNames;
|
||||
import com.ibm.icu.text.CurrencyMetaInfo;
|
||||
import com.ibm.icu.text.CurrencyMetaInfo.CurrencyDigits;
|
||||
import com.ibm.icu.text.CurrencyMetaInfo.CurrencyFilter;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale.Category;
|
||||
|
||||
/**
|
||||
|
@ -98,13 +99,6 @@ public class Currency extends MeasureUnit {
|
|||
*/
|
||||
public static final int NARROW_SYMBOL_NAME = 3;
|
||||
|
||||
private static final EquivalenceRelation<String> EQUIVALENT_CURRENCY_SYMBOLS =
|
||||
new EquivalenceRelation<String>()
|
||||
.add("\u00a5", "\uffe5")
|
||||
.add("$", "\ufe69", "\uff04")
|
||||
.add("\u20a8", "\u20b9")
|
||||
.add("\u00a3", "\u20a4");
|
||||
|
||||
/**
|
||||
* Currency Usage used for Decimal Format
|
||||
* @stable ICU 54
|
||||
|
@ -778,8 +772,16 @@ public class Currency extends MeasureUnit {
|
|||
String isoCode = e.getValue();
|
||||
// Register under not just symbol, but under every equivalent symbol as well
|
||||
// e.g short width yen and long width yen.
|
||||
for (String equivalentSymbol : EQUIVALENT_CURRENCY_SYMBOLS.get(symbol)) {
|
||||
symTrie.put(equivalentSymbol, new CurrencyStringInfo(isoCode, symbol));
|
||||
UnicodeSetStaticCache.Key key = UnicodeSetStaticCache.chooseCurrency(symbol);
|
||||
CurrencyStringInfo value = new CurrencyStringInfo(isoCode, symbol);
|
||||
if (key != null) {
|
||||
UnicodeSet equivalents = UnicodeSetStaticCache.get(key);
|
||||
// The symbol itself is included in the UnicodeSet
|
||||
for (String equivalentSymbol : equivalents) {
|
||||
symTrie.put(equivalentSymbol, value);
|
||||
}
|
||||
} else {
|
||||
symTrie.put(symbol, value);
|
||||
}
|
||||
}
|
||||
for (Map.Entry<String, String> e : names.nameMap().entrySet()) {
|
||||
|
@ -1039,34 +1041,6 @@ public class Currency extends MeasureUnit {
|
|||
return info.currencies(filter.withTender());
|
||||
}
|
||||
|
||||
private static final class EquivalenceRelation<T> {
|
||||
|
||||
private Map<T, Set<T>> data = new HashMap<T, Set<T>>();
|
||||
|
||||
@SuppressWarnings("unchecked") // See ticket #11395, this is safe.
|
||||
public EquivalenceRelation<T> add(T... items) {
|
||||
Set<T> group = new HashSet<T>();
|
||||
for (T item : items) {
|
||||
if (data.containsKey(item)) {
|
||||
throw new IllegalArgumentException("All groups passed to add must be disjoint.");
|
||||
}
|
||||
group.add(item);
|
||||
}
|
||||
for (T item : items) {
|
||||
data.put(item, group);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public Set<T> get(T item) {
|
||||
Set<T> result = data.get(item);
|
||||
if (result == null) {
|
||||
return Collections.singleton(item);
|
||||
}
|
||||
return Collections.unmodifiableSet(result);
|
||||
}
|
||||
}
|
||||
|
||||
private Object writeReplace() throws ObjectStreamException {
|
||||
return new MeasureUnitProxy(type, subType);
|
||||
}
|
||||
|
|
|
@ -1764,7 +1764,7 @@ public class NumberFormatTest extends TestFmwk {
|
|||
}
|
||||
|
||||
// Test default ignorable characters. These should work in both lenient and strict.
|
||||
UnicodeSet defaultIgnorables = new UnicodeSet("[[:Default_Ignorable_Code_Point:]]").freeze();
|
||||
UnicodeSet defaultIgnorables = new UnicodeSet("[[:Bidi_Control:]]").freeze();
|
||||
fmt.setParseStrict(false);
|
||||
for (String ignorable : defaultIgnorables) {
|
||||
String str = "a b " + ignorable + "1234c ";
|
||||
|
|
Loading…
Add table
Reference in a new issue