diff --git a/icu4c/source/common/ucurr.cpp b/icu4c/source/common/ucurr.cpp index 37c3d79e4d1..6ce53c2d5e5 100644 --- a/icu4c/source/common/ucurr.cpp +++ b/icu4c/source/common/ucurr.cpp @@ -16,6 +16,8 @@ #include "unicode/ures.h" #include "unicode/ustring.h" #include "unicode/parsepos.h" +#include "unicode/uniset.h" +#include "unicode/utf16.h" #include "ustr_imp.h" #include "charstr.h" #include "cmemory.h" @@ -1287,17 +1289,28 @@ static void linearSearch(const CurrencyNameStruct* currencyNames, int32_t begin, int32_t end, const UChar* text, int32_t textLen, + int32_t *partialMatchLen, int32_t *maxMatchLen, int32_t* maxMatchIndex) { + int32_t initialPartialMatchLen = *partialMatchLen; for (int32_t index = begin; index <= end; ++index) { int32_t len = currencyNames[index].currencyNameLen; if (len > *maxMatchLen && len <= textLen && uprv_memcmp(currencyNames[index].currencyName, text, len * sizeof(UChar)) == 0) { + *partialMatchLen = MAX(*partialMatchLen, len); *maxMatchIndex = index; *maxMatchLen = len; #ifdef UCURR_DEBUG printf("maxMatchIndex = %d, maxMatchLen = %d\n", *maxMatchIndex, *maxMatchLen); #endif + } else { + // Check for partial matches. + for (int32_t i=initialPartialMatchLen; icurrencyNames; - total_currency_name_count = cacheEntry->totalCurrencyNameCount; - currencySymbols = cacheEntry->currencySymbols; - total_currency_symbol_count = cacheEntry->totalCurrencySymbolCount; ++(cacheEntry->refCount); } umtx_unlock(&gCurrencyCacheMutex); if (found == -1) { collectCurrencyNames(locale, ¤cyNames, &total_currency_name_count, ¤cySymbols, &total_currency_symbol_count, ec); if (U_FAILURE(ec)) { - return; + return NULL; } umtx_lock(&gCurrencyCacheMutex); // check again. @@ -1505,15 +1511,45 @@ uprv_parseCurrency(const char* locale, deleteCurrencyNames(currencyNames, total_currency_name_count); deleteCurrencyNames(currencySymbols, total_currency_symbol_count); cacheEntry = currCache[found]; - currencyNames = cacheEntry->currencyNames; - total_currency_name_count = cacheEntry->totalCurrencyNameCount; - currencySymbols = cacheEntry->currencySymbols; - total_currency_symbol_count = cacheEntry->totalCurrencySymbolCount; ++(cacheEntry->refCount); } umtx_unlock(&gCurrencyCacheMutex); } + return cacheEntry; +} + +static void releaseCacheEntry(CurrencyNameCacheEntry* cacheEntry) { + umtx_lock(&gCurrencyCacheMutex); + --(cacheEntry->refCount); + if (cacheEntry->refCount == 0) { // remove + deleteCacheEntry(cacheEntry); + } + umtx_unlock(&gCurrencyCacheMutex); +} + +U_CAPI void +uprv_parseCurrency(const char* locale, + const icu::UnicodeString& text, + icu::ParsePosition& pos, + int8_t type, + int32_t* partialMatchLen, + UChar* result, + UErrorCode& ec) { + U_NAMESPACE_USE + if (U_FAILURE(ec)) { + return; + } + CurrencyNameCacheEntry* cacheEntry = getCacheEntry(locale, ec); + if (U_FAILURE(ec)) { + return; + } + + int32_t total_currency_name_count = cacheEntry->totalCurrencyNameCount; + CurrencyNameStruct* currencyNames = cacheEntry->currencyNames; + int32_t total_currency_symbol_count = cacheEntry->totalCurrencySymbolCount; + CurrencyNameStruct* currencySymbols = cacheEntry->currencySymbols; + int32_t start = pos.getIndex(); UChar inputText[MAX_CURRENCY_NAME_LEN]; @@ -1523,11 +1559,14 @@ uprv_parseCurrency(const char* locale, UErrorCode ec1 = U_ZERO_ERROR; textLen = u_strToUpper(upperText, MAX_CURRENCY_NAME_LEN, inputText, textLen, locale, &ec1); + // Make sure partialMatchLen is initialized + *partialMatchLen = 0; + int32_t max = 0; int32_t matchIndex = -1; // case in-sensitive comparision against currency names searchCurrencyName(currencyNames, total_currency_name_count, - upperText, textLen, &max, &matchIndex); + upperText, textLen, partialMatchLen, &max, &matchIndex); #ifdef UCURR_DEBUG printf("search in names, max = %d, matchIndex = %d\n", max, matchIndex); @@ -1538,7 +1577,8 @@ uprv_parseCurrency(const char* locale, if (type != UCURR_LONG_NAME) { // not name only // case sensitive comparison against currency symbols and ISO code. searchCurrencyName(currencySymbols, total_currency_symbol_count, - inputText, textLen, + inputText, textLen, + partialMatchLen, &maxInSymbol, &matchIndexInSymbol); } @@ -1555,15 +1595,38 @@ uprv_parseCurrency(const char* locale, } else if (maxInSymbol >= max && matchIndexInSymbol != -1) { u_charsToUChars(currencySymbols[matchIndexInSymbol].IsoCode, result, 4); pos.setIndex(start + maxInSymbol); - } + } // decrease reference count - umtx_lock(&gCurrencyCacheMutex); - --(cacheEntry->refCount); - if (cacheEntry->refCount == 0) { // remove - deleteCacheEntry(cacheEntry); + releaseCacheEntry(cacheEntry); +} + +void uprv_currencyLeads(const char* locale, icu::UnicodeSet& result, UErrorCode& ec) { + U_NAMESPACE_USE + if (U_FAILURE(ec)) { + return; } - umtx_unlock(&gCurrencyCacheMutex); + CurrencyNameCacheEntry* cacheEntry = getCacheEntry(locale, ec); + if (U_FAILURE(ec)) { + return; + } + + for (int32_t i=0; itotalCurrencySymbolCount; i++) { + const CurrencyNameStruct& info = cacheEntry->currencySymbols[i]; + UChar32 cp; + U16_GET(info.currencyName, 0, 0, info.currencyNameLen, cp); + result.add(cp); + } + + for (int32_t i=0; itotalCurrencyNameCount; i++) { + const CurrencyNameStruct& info = cacheEntry->currencyNames[i]; + UChar32 cp; + U16_GET(info.currencyName, 0, 0, info.currencyNameLen, cp); + result.add(cp); + } + + // decrease reference count + releaseCacheEntry(cacheEntry); } diff --git a/icu4c/source/common/ucurrimp.h b/icu4c/source/common/ucurrimp.h index 6e468fd4c94..6d9588295df 100644 --- a/icu4c/source/common/ucurrimp.h +++ b/icu4c/source/common/ucurrimp.h @@ -13,6 +13,7 @@ #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/parsepos.h" +#include "unicode/uniset.h" /** * Internal method. Given a currency ISO code and a locale, return @@ -36,6 +37,8 @@ uprv_getStaticCurrencyName(const UChar* iso, const char* loc, * match, then the display name is preferred, unless it's length * is less than 3. * + * The parameters must not be NULL. + * * @param locale the locale of the display names to match * @param text the text to parse * @param pos input-output position; on input, the position within @@ -43,6 +46,8 @@ uprv_getStaticCurrencyName(const UChar* iso, const char* loc, * on output, the position after the last matched character. If * the parse fails, the position in unchanged upon output. * @param type currency type to parse against, LONG_NAME only or not + * @param partialMatchLen The length of the longest matching prefix; + * this may be nonzero even if no full currency was matched. * @return the ISO 4217 code, as a string, of the best match, or * null if there is no match * @@ -53,9 +58,21 @@ uprv_parseCurrency(const char* locale, const icu::UnicodeString& text, icu::ParsePosition& pos, int8_t type, + int32_t* partialMatchLen, UChar* result, UErrorCode& ec); +/** + * Puts all possible first-characters of a currency into the + * specified UnicodeSet. + * + * @param locale the locale of the display names of interest + * @param result the UnicodeSet to which to add the starting characters + */ +void uprv_currencyLeads(const char* locale, icu::UnicodeSet& result, UErrorCode& ec); + + + #endif /* #ifndef _UCURR_IMP_H_ */ //eof diff --git a/icu4c/source/i18n/Makefile.in b/icu4c/source/i18n/Makefile.in index 75c5565d4de..a24adbeb08d 100644 --- a/icu4c/source/i18n/Makefile.in +++ b/icu4c/source/i18n/Makefile.in @@ -109,7 +109,8 @@ number_integerwidth.o number_longnames.o number_modifiers.o number_notation.o \ number_padding.o number_patternmodifier.o number_patternstring.o \ number_rounding.o number_scientific.o number_stringbuilder.o \ numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \ -numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o +numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o \ +numparse_currency.o ## Header files to install diff --git a/icu4c/source/i18n/decimfmt.cpp b/icu4c/source/i18n/decimfmt.cpp index 3861db3df68..713abd7f9fe 100644 --- a/icu4c/source/i18n/decimfmt.cpp +++ b/icu4c/source/i18n/decimfmt.cpp @@ -2171,10 +2171,11 @@ int32_t DecimalFormat::compareComplexAffix(const UnicodeString& affixPat, // determine our locale. const char* loc = fCurrencyPluralInfo->getLocale().getName(); ParsePosition ppos(pos); + int32_t currMatchLen = 0; UChar curr[4]; UErrorCode ec = U_ZERO_ERROR; // Delegate parse of display name => ISO code to Currency - uprv_parseCurrency(loc, text, ppos, type, curr, ec); + uprv_parseCurrency(loc, text, ppos, type, &currMatchLen, curr, ec); // If parse succeeds, populate currency[0] if (U_SUCCESS(ec) && ppos.getIndex() != pos) { diff --git a/icu4c/source/i18n/numparse_currency.cpp b/icu4c/source/i18n/numparse_currency.cpp new file mode 100644 index 00000000000..7a78a3bbb7d --- /dev/null +++ b/icu4c/source/i18n/numparse_currency.cpp @@ -0,0 +1,67 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT + +#include "numparse_types.h" +#include "numparse_currency.h" +#include "ucurrimp.h" +#include "unicode/errorcode.h" + +using namespace icu; +using namespace icu::numparse; +using namespace icu::numparse::impl; + + +CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status) + : fLocaleName(locale.getName(), -1, status) {} + +bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { + if (result.currencyCode[0] != 0) { + return false; + } + + // NOTE: This requires a new UnicodeString to be allocated, instead of using the StringSegment. + // This should be fixed with #13584. + UnicodeString segmentString = segment.toUnicodeString(); + + // Try to parse the currency + ParsePosition ppos(0); + int32_t partialMatchLen = 0; + uprv_parseCurrency( + fLocaleName.data(), + segmentString, + ppos, + UCURR_SYMBOL_NAME, // checks for both UCURR_SYMBOL_NAME and UCURR_LONG_NAME + &partialMatchLen, + result.currencyCode, + status); + + // Possible partial match + bool partialMatch = partialMatchLen == segment.length(); + + if (U_SUCCESS(status) && ppos.getIndex() != 0) { + // Complete match. + // NOTE: The currency code should already be saved in the ParsedNumber. + segment.adjustOffset(ppos.getIndex()); + result.setCharsConsumed(segment); + } + + return partialMatch; +} + +const UnicodeSet* CurrencyNamesMatcher::getLeadCodePoints() const { + ErrorCode status; + UnicodeSet* leadCodePoints = new UnicodeSet(); + uprv_currencyLeads(fLocaleName.data(), *leadCodePoints, status); + // Always apply case mapping closure for currencies + leadCodePoints->closeOver(USET_ADD_CASE_MAPPINGS); + leadCodePoints->freeze(); + + return leadCodePoints; +} + + +#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_currency.h b/icu4c/source/i18n/numparse_currency.h new file mode 100644 index 00000000000..49b367a8964 --- /dev/null +++ b/icu4c/source/i18n/numparse_currency.h @@ -0,0 +1,47 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +#ifndef __NUMPARSE_CURRENCY_H__ +#define __NUMPARSE_CURRENCY_H__ + +#include "numparse_types.h" +#include "charstr.h" + +U_NAMESPACE_BEGIN namespace numparse { +namespace impl { + + +/** + * Matches currencies according to all available strings in locale data. + * + * The implementation of this class is different between J and C. See #13584 for a follow-up. + * + * @author sffc + */ +class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory { + public: + CurrencyNamesMatcher() = default; // WARNING: Leaves the object in an unusable state + + CurrencyNamesMatcher(const Locale& locale, UErrorCode& status); + + bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; + + const UnicodeSet* getLeadCodePoints() const override; + + private: + // We could use Locale instead of CharString here, but + // Locale has a non-trivial default constructor. + CharString fLocaleName; + +}; + + +} // namespace impl +} // namespace numparse +U_NAMESPACE_END + +#endif //__NUMPARSE_CURRENCY_H__ +#endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp index 68707439fa2..2fe84fcbc97 100644 --- a/icu4c/source/i18n/numparse_impl.cpp +++ b/icu4c/source/i18n/numparse_impl.cpp @@ -58,7 +58,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); -// parser.addMatcher(CurrencyTrieMatcher.getInstance(locale)); + parser->addMatcher(parser->fLocalMatchers.currencyNames = {locale, status}); // parser.addMatcher(new RequireNumberMatcher()); parser->freeze(); @@ -91,12 +91,26 @@ void NumberParserImpl::addMatcher(const NumberParseMatcher& matcher) { fMatchers[fNumMatchers] = &matcher; if (fComputeLeads) { - fLeads[fNumMatchers] = matcher.getLeadCodePoints(); + addLeadCodePointsForMatcher(matcher); } fNumMatchers++; } +void NumberParserImpl::addLeadCodePointsForMatcher(const NumberParseMatcher& matcher) { + const UnicodeSet* leadCodePoints = matcher.getLeadCodePoints(); + // TODO: Avoid the clone operation here. + if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) { + UnicodeSet* copy = static_cast(leadCodePoints->cloneAsThawed()); + delete leadCodePoints; + copy->closeOver(USET_ADD_CASE_MAPPINGS); + copy->freeze(); + fLeads[fNumMatchers] = copy; + } else { + fLeads[fNumMatchers] = leadCodePoints; + } +} + void NumberParserImpl::freeze() { fFrozen = true; } diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h index 4745bf152a8..0fe45fa5f42 100644 --- a/icu4c/source/i18n/numparse_impl.h +++ b/icu4c/source/i18n/numparse_impl.h @@ -12,6 +12,7 @@ #include "numparse_symbols.h" #include "numparse_scientific.h" #include "unicode/uniset.h" +#include "numparse_currency.h" U_NAMESPACE_BEGIN namespace numparse { namespace impl { @@ -43,7 +44,7 @@ class NumberParserImpl { bool fComputeLeads; bool fFrozen = false; - // WARNING: All of these matchers start in an uninitialized state. + // WARNING: All of these matchers start in an undefined state (default-constructed). // You must use an assignment operator on them before using. struct { IgnorablesMatcher ignorables; @@ -56,10 +57,13 @@ class NumberParserImpl { PlusSignMatcher plusSign; DecimalMatcher decimal; ScientificMatcher scientific; + CurrencyNamesMatcher currencyNames; } fLocalMatchers; NumberParserImpl(parse_flags_t parseFlags, bool computeLeads); + void addLeadCodePointsForMatcher(const NumberParseMatcher& matcher); + void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const; void parseLongestRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const; diff --git a/icu4c/source/i18n/numparse_parsednumber.cpp b/icu4c/source/i18n/numparse_parsednumber.cpp index 203383692f2..908cffecef7 100644 --- a/icu4c/source/i18n/numparse_parsednumber.cpp +++ b/icu4c/source/i18n/numparse_parsednumber.cpp @@ -23,7 +23,7 @@ void ParsedNumber::clear() { flags = 0; prefix.setToBogus(); suffix.setToBogus(); - currencyCode.setToBogus(); + currencyCode[0] = 0; } void ParsedNumber::setCharsConsumed(const StringSegment& segment) { diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h index 5280c41fece..30ad92d3713 100644 --- a/icu4c/source/i18n/numparse_types.h +++ b/icu4c/source/i18n/numparse_types.h @@ -93,7 +93,7 @@ class ParsedNumber { /** * The currency that got consumed. */ - UnicodeString currencyCode; + UChar currencyCode[4]; ParsedNumber(); diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp index 76e193a04be..4140320bc9c 100644 --- a/icu4c/source/test/intltest/numbertest_parse.cpp +++ b/icu4c/source/test/intltest/numbertest_parse.cpp @@ -67,8 +67,8 @@ void NumberParserTest::testBasic() { {3, u"@@@123@@ ", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak? // {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.}, // {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.}, -// {3, u"514.23 USD", u"¤0", 10, 514.23}, -// {3, u"514.23 GBP", u"¤0", 10, 514.23}, + {3, u"514.23 USD", u"¤0", 10, 514.23}, + {3, u"514.23 GBP", u"¤0", 10, 514.23}, // {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.}, // {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.}, // {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.}, @@ -88,7 +88,7 @@ void NumberParserTest::testBasic() { {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.}, {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142}, {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142}, -// {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5}, + {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5}, // {3, u"a$ b5", u"a ¤ b0", 5, 5.0}, // {3, u"📺1.23", u"📺0;📻0", 6, 1.23}, // {3, u"📻1.23", u"📺0;📻0", 6, -1.23},