From 921355c6f05f6b5cc1d7a2e3c4fa9d66ba85fac7 Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Sat, 31 Mar 2018 05:18:51 +0000 Subject: [PATCH] ICU-13634 Refactoring the two separate currency matchers into a single unified CombinedCurrencyMatcher. Allows for easy implementation of currency spacing (included in this changeset) and possibly other currency-related parsing features in the future. X-SVN-Rev: 41181 --- icu4c/source/i18n/numparse_affixes.cpp | 2 +- icu4c/source/i18n/numparse_affixes.h | 2 +- icu4c/source/i18n/numparse_compositions.cpp | 38 ----- icu4c/source/i18n/numparse_compositions.h | 44 ++--- icu4c/source/i18n/numparse_currency.cpp | 158 ++++++++---------- icu4c/source/i18n/numparse_currency.h | 69 ++------ icu4c/source/i18n/numparse_impl.cpp | 5 +- icu4c/source/i18n/numparse_impl.h | 3 +- icu4c/source/test/intltest/numbertest.h | 3 +- .../source/test/intltest/numbertest_parse.cpp | 6 +- .../test/testdata/NumberFormatTestCases.txt | 10 +- .../parse/AffixTokenMatcherFactory.java | 8 +- .../ibm/icu/impl/number/parse/AnyMatcher.java | 92 ---------- .../number/parse/CombinedCurrencyMatcher.java | 157 +++++++++++++++++ .../number/parse/CurrencyCustomMatcher.java | 67 -------- .../number/parse/CurrencyNamesMatcher.java | 82 --------- .../impl/number/parse/NumberParserImpl.java | 8 +- .../icu/dev/test/format/NumberFormatTest.java | 10 ++ .../icu/dev/test/number/NumberParserTest.java | 7 +- 19 files changed, 304 insertions(+), 467 deletions(-) delete mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java create mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CombinedCurrencyMatcher.java delete mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java delete mode 100644 icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java diff --git a/icu4c/source/i18n/numparse_affixes.cpp b/icu4c/source/i18n/numparse_affixes.cpp index 83ecdd144b8..013cc01ff87 100644 --- a/icu4c/source/i18n/numparse_affixes.cpp +++ b/icu4c/source/i18n/numparse_affixes.cpp @@ -190,7 +190,7 @@ NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { } NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { - return fCurrency = {{fSetupData->locale, status}, {fSetupData->currencySymbols, status}}; + return fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, status}; } IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { diff --git a/icu4c/source/i18n/numparse_affixes.h b/icu4c/source/i18n/numparse_affixes.h index 3b79457b6d3..08a7d912c6a 100644 --- a/icu4c/source/i18n/numparse_affixes.h +++ b/icu4c/source/i18n/numparse_affixes.h @@ -125,7 +125,7 @@ class AffixTokenMatcherWarehouse : public UMemory { PlusSignMatcher fPlusSign; PercentMatcher fPercent; PermilleMatcher fPermille; - CurrencyAnyMatcher fCurrency; + CombinedCurrencyMatcher fCurrency; // Use a child class for code point matchers, since it requires non-default operators. CodePointMatcherWarehouse fCodePoints; diff --git a/icu4c/source/i18n/numparse_compositions.cpp b/icu4c/source/i18n/numparse_compositions.cpp index d254c07349d..06aa476a29b 100644 --- a/icu4c/source/i18n/numparse_compositions.cpp +++ b/icu4c/source/i18n/numparse_compositions.cpp @@ -18,44 +18,6 @@ using namespace icu::numparse; using namespace icu::numparse::impl; -bool AnyMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { - int32_t initialOffset = segment.getOffset(); - bool maybeMore = false; - - // NOTE: The range-based for loop calls the virtual begin() and end() methods. - for (auto& matcher : *this) { - maybeMore = maybeMore || matcher->match(segment, result, status); - if (segment.getOffset() != initialOffset) { - // Match succeeded. - // NOTE: Except for a couple edge cases, if a matcher accepted string A, then it will - // accept any string starting with A. Therefore, there is no possibility that matchers - // later in the list may be evaluated on longer strings, and we can exit the loop here. - break; - } - } - - // None of the matchers succeeded. - return maybeMore; -} - -bool AnyMatcher::smokeTest(const StringSegment& segment) const { - // NOTE: The range-based for loop calls the virtual begin() and end() methods. - for (auto& matcher : *this) { - if (matcher->smokeTest(segment)) { - return true; - } - } - return false; -} - -void AnyMatcher::postProcess(ParsedNumber& result) const { - // NOTE: The range-based for loop calls the virtual begin() and end() methods. - for (auto& matcher : *this) { - matcher->postProcess(result); - } -} - - bool SeriesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { ParsedNumber backup(result); diff --git a/icu4c/source/i18n/numparse_compositions.h b/icu4c/source/i18n/numparse_compositions.h index 8d61ab46d32..a0b20c3433c 100644 --- a/icu4c/source/i18n/numparse_compositions.h +++ b/icu4c/source/i18n/numparse_compositions.h @@ -29,27 +29,29 @@ class CompositionMatcher : public NumberParseMatcher { }; -/** - * Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses - * the first matcher in the list to succeed. - * - * NOTE: In C++, this is a base class, unlike ICU4J, which uses a factory-style interface. - * - * @author sffc - * @see SeriesMatcher - */ -class AnyMatcher : public CompositionMatcher { - public: - bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - - bool smokeTest(const StringSegment& segment) const override; - - void postProcess(ParsedNumber& result) const override; - - protected: - // No construction except by subclasses! - AnyMatcher() = default; -}; +// NOTE: AnyMatcher is no longer being used. The previous definition is shown below. +// The implementation can be found in SVN source control, deleted around March 30, 2018. +///** +// * Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses +// * the first matcher in the list to succeed. +// * +// * NOTE: In C++, this is a base class, unlike ICU4J, which uses a factory-style interface. +// * +// * @author sffc +// * @see SeriesMatcher +// */ +//class AnyMatcher : public CompositionMatcher { +// public: +// bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; +// +// bool smokeTest(const StringSegment& segment) const override; +// +// void postProcess(ParsedNumber& result) const override; +// +// protected: +// // No construction except by subclasses! +// AnyMatcher() = default; +//}; /** diff --git a/icu4c/source/i18n/numparse_currency.cpp b/icu4c/source/i18n/numparse_currency.cpp index 6dee3d28d4e..5c14fd7429e 100644 --- a/icu4c/source/i18n/numparse_currency.cpp +++ b/icu4c/source/i18n/numparse_currency.cpp @@ -20,19 +20,83 @@ using namespace icu::numparse; using namespace icu::numparse::impl; -CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status) - : fLocaleName(locale.getName(), -1, status) { +CombinedCurrencyMatcher::CombinedCurrencyMatcher(const CurrencySymbols& currencySymbols, + const DecimalFormatSymbols& dfs, UErrorCode& status) + : fCurrency1(currencySymbols.getCurrencySymbol(status)), + fCurrency2(currencySymbols.getIntlCurrencySymbol(status)), + afterPrefixInsert(dfs.getPatternForCurrencySpacing(UNUM_CURRENCY_INSERT, false, status)), + beforeSuffixInsert(dfs.getPatternForCurrencySpacing(UNUM_CURRENCY_INSERT, true, status)), + fLocaleName(dfs.getLocale().getName(), -1, status) { + utils::copyCurrencyCode(fCurrencyCode, currencySymbols.getIsoCode()); + + // Compute the full set of characters that could be the first in a currency to allow for + // efficient smoke test. + fLeadCodePoints.add(fCurrency1.char32At(0)); + fLeadCodePoints.add(fCurrency2.char32At(0)); + fLeadCodePoints.add(beforeSuffixInsert.char32At(0)); uprv_currencyLeads(fLocaleName.data(), fLeadCodePoints, status); // Always apply case mapping closure for currencies fLeadCodePoints.closeOver(USET_ADD_CASE_MAPPINGS); fLeadCodePoints.freeze(); } -bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const { +bool CombinedCurrencyMatcher::match(StringSegment& segment, ParsedNumber& result, + UErrorCode& status) const { if (result.currencyCode[0] != 0) { return false; } + // Try to match a currency spacing separator. + int32_t initialOffset = segment.getOffset(); + bool maybeMore = false; + if (result.seenNumber()) { + int32_t overlap = segment.getCommonPrefixLength(beforeSuffixInsert); + if (overlap == beforeSuffixInsert.length()) { + segment.adjustOffset(overlap); + // Note: let currency spacing be a weak match. Don't update chars consumed. + } + maybeMore = maybeMore || overlap == segment.length(); + } + + // Match the currency string, and reset if we didn't find one. + maybeMore = maybeMore || matchCurrency(segment, result, status); + if (result.currencyCode[0] == 0) { + segment.setOffset(initialOffset); + return maybeMore; + } + + // Try to match a currency spacing separator. + if (!result.seenNumber()) { + int32_t overlap = segment.getCommonPrefixLength(afterPrefixInsert); + if (overlap == afterPrefixInsert.length()) { + segment.adjustOffset(overlap); + // Note: let currency spacing be a weak match. Don't update chars consumed. + } + maybeMore = maybeMore || overlap == segment.length(); + } + + return maybeMore; +} + +bool CombinedCurrencyMatcher::matchCurrency(StringSegment& segment, ParsedNumber& result, + UErrorCode& status) const { + + int32_t overlap1 = segment.getCommonPrefixLength(fCurrency1); + if (overlap1 == fCurrency1.length()) { + utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); + segment.adjustOffset(overlap1); + result.setCharsConsumed(segment); + return segment.length() == 0; + } + + int32_t overlap2 = segment.getCommonPrefixLength(fCurrency2); + if (overlap2 == fCurrency2.length()) { + utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); + segment.adjustOffset(overlap2); + result.setCharsConsumed(segment); + return segment.length() == 0; + } + // NOTE: This call site should be improved with #13584. const UnicodeString segmentString = segment.toTempUnicodeString(); @@ -48,9 +112,6 @@ bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, U result.currencyCode, status); - // Possible partial match - bool partialMatch = partialMatchLen == segment.length(); - if (U_SUCCESS(status) && ppos.getIndex() != 0) { // Complete match. // NOTE: The currency code should already be saved in the ParsedNumber. @@ -58,91 +119,16 @@ bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, U result.setCharsConsumed(segment); } - return partialMatch; + return overlap1 == segment.length() || overlap2 == segment.length() || + partialMatchLen == segment.length(); } -bool CurrencyNamesMatcher::smokeTest(const StringSegment& segment) const { +bool CombinedCurrencyMatcher::smokeTest(const StringSegment& segment) const { return segment.startsWith(fLeadCodePoints); } -UnicodeString CurrencyNamesMatcher::toString() const { - return u""; -} - - -CurrencyCustomMatcher::CurrencyCustomMatcher(const CurrencySymbols& currencySymbols, UErrorCode& status) - : fCurrency1(currencySymbols.getCurrencySymbol(status)), - fCurrency2(currencySymbols.getIntlCurrencySymbol(status)) { - utils::copyCurrencyCode(fCurrencyCode, currencySymbols.getIsoCode()); -} - -bool CurrencyCustomMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const { - if (result.currencyCode[0] != 0) { - return false; - } - - int overlap1 = segment.getCommonPrefixLength(fCurrency1); - if (overlap1 == fCurrency1.length()) { - utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); - segment.adjustOffset(overlap1); - result.setCharsConsumed(segment); - } - - int overlap2 = segment.getCommonPrefixLength(fCurrency2); - if (overlap2 == fCurrency2.length()) { - utils::copyCurrencyCode(result.currencyCode, fCurrencyCode); - segment.adjustOffset(overlap2); - result.setCharsConsumed(segment); - } - - return overlap1 == segment.length() || overlap2 == segment.length(); -} - -bool CurrencyCustomMatcher::smokeTest(const StringSegment& segment) const { - return segment.startsWith(fCurrency1) - || segment.startsWith(fCurrency2); -} - -UnicodeString CurrencyCustomMatcher::toString() const { - return u""; -} - - -CurrencyAnyMatcher::CurrencyAnyMatcher() { - fMatcherArray[0] = &fNamesMatcher; - fMatcherArray[1] = &fCustomMatcher; -} - -CurrencyAnyMatcher::CurrencyAnyMatcher(CurrencyNamesMatcher namesMatcher, - CurrencyCustomMatcher customMatcher) - : fNamesMatcher(std::move(namesMatcher)), fCustomMatcher(std::move(customMatcher)) { - fMatcherArray[0] = &fNamesMatcher; - fMatcherArray[1] = &fCustomMatcher; -} - -CurrencyAnyMatcher::CurrencyAnyMatcher(CurrencyAnyMatcher&& src) U_NOEXCEPT - : fNamesMatcher(std::move(src.fNamesMatcher)), fCustomMatcher(std::move(src.fCustomMatcher)) { - fMatcherArray[0] = &fNamesMatcher; - fMatcherArray[1] = &fCustomMatcher; -} - -CurrencyAnyMatcher& CurrencyAnyMatcher::operator=(CurrencyAnyMatcher&& src) U_NOEXCEPT { - fNamesMatcher = std::move(src.fNamesMatcher); - fCustomMatcher = std::move(src.fCustomMatcher); - // Note: do NOT move fMatcherArray - return *this; -} - -const NumberParseMatcher* const* CurrencyAnyMatcher::begin() const { - return fMatcherArray; -} - -const NumberParseMatcher* const* CurrencyAnyMatcher::end() const { - return fMatcherArray + 2; -} - -UnicodeString CurrencyAnyMatcher::toString() const { - return u""; +UnicodeString CombinedCurrencyMatcher::toString() const { + return u""; } diff --git a/icu4c/source/i18n/numparse_currency.h b/icu4c/source/i18n/numparse_currency.h index 1c2a57d2c14..fa7d67b9bde 100644 --- a/icu4c/source/i18n/numparse_currency.h +++ b/icu4c/source/i18n/numparse_currency.h @@ -19,38 +19,21 @@ namespace impl { using ::icu::number::impl::CurrencySymbols; /** - * Matches currencies according to all available strings in locale data. + * Matches a currency, either a custom currency or one from the data bundle. The class is called + * "combined" to emphasize that the currency string may come from one of multiple sources. * - * The implementation of this class is different between J and C. See #13584 for a follow-up. + * Will match currency spacing either before or after the number depending on whether we are currently in + * the prefix or suffix. + * + * The implementation of this class is slightly different between J and C. See #13584 for a follow-up. * * @author sffc */ -class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory { +class CombinedCurrencyMatcher : public NumberParseMatcher, public UMemory { public: - CurrencyNamesMatcher() = default; // WARNING: Leaves the object in an unusable state + CombinedCurrencyMatcher() = default; // WARNING: Leaves the object in an unusable state - CurrencyNamesMatcher(const Locale& locale, UErrorCode& status); - - bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; - - bool smokeTest(const StringSegment& segment) const override; - - UnicodeString toString() const override; - - private: - // We could use Locale instead of CharString here, but - // Locale has a non-trivial default constructor. - CharString fLocaleName; - - UnicodeSet fLeadCodePoints; -}; - - -class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory { - public: - CurrencyCustomMatcher() = default; // WARNING: Leaves the object in an unusable state - - CurrencyCustomMatcher(const CurrencySymbols& currencySymbols, UErrorCode& status); + CombinedCurrencyMatcher(const CurrencySymbols& currencySymbols, const DecimalFormatSymbols& dfs, UErrorCode& status); bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override; @@ -62,36 +45,18 @@ class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory { UChar fCurrencyCode[4]; UnicodeString fCurrency1; UnicodeString fCurrency2; -}; + UnicodeString afterPrefixInsert; + UnicodeString beforeSuffixInsert; -/** - * An implementation of AnyMatcher, allowing for either currency data or locale currency matches. - */ -class CurrencyAnyMatcher : public AnyMatcher, public UMemory { - public: - CurrencyAnyMatcher(); // WARNING: Leaves the object in an unusable state + // We could use Locale instead of CharString here, but + // Locale has a non-trivial default constructor. + CharString fLocaleName; - CurrencyAnyMatcher(CurrencyNamesMatcher namesMatcher, CurrencyCustomMatcher customMatcher); + UnicodeSet fLeadCodePoints; - // Needs custom move constructor/operator since constructor is nontrivial - - CurrencyAnyMatcher(CurrencyAnyMatcher&& src) U_NOEXCEPT; - - CurrencyAnyMatcher& operator=(CurrencyAnyMatcher&& src) U_NOEXCEPT; - - UnicodeString toString() const override; - - protected: - const NumberParseMatcher* const* begin() const override; - - const NumberParseMatcher* const* end() const override; - - private: - CurrencyNamesMatcher fNamesMatcher; - CurrencyCustomMatcher fCustomMatcher; - - const NumberParseMatcher* fMatcherArray[2]; + /** Matches the currency string without concern for currency spacing. */ + bool matchCurrency(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const; }; diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp index 36ddc1f2f19..89db7001a34 100644 --- a/icu4c/source/i18n/numparse_impl.cpp +++ b/icu4c/source/i18n/numparse_impl.cpp @@ -69,7 +69,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& parser->addMatcher(parser->fLocalMatchers.infinity = {symbols}); parser->addMatcher(parser->fLocalMatchers.padding = {u"@"}); parser->addMatcher(parser->fLocalMatchers.scientific = {symbols, grouper}); - parser->addMatcher(parser->fLocalMatchers.currencyNames = {locale, status}); + parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, status}); // parser.addMatcher(new RequireNumberMatcher()); parser->freeze(); @@ -136,8 +136,7 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr //////////////////////// if (parseCurrency || patternInfo.hasCurrencySign()) { - parser->addMatcher(parser->fLocalMatchers.currencyCustom = {currencySymbols, status}); - parser->addMatcher(parser->fLocalMatchers.currencyNames = {locale, status}); + parser->addMatcher(parser->fLocalMatchers.currency = {currencySymbols, symbols, status}); } /////////////////////////////// diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h index 96a259a55fb..308a2ffcf81 100644 --- a/icu4c/source/i18n/numparse_impl.h +++ b/icu4c/source/i18n/numparse_impl.h @@ -68,8 +68,7 @@ class NumberParserImpl : public MutableMatcherCollection { PlusSignMatcher plusSign; DecimalMatcher decimal; ScientificMatcher scientific; - CurrencyNamesMatcher currencyNames; - CurrencyCustomMatcher currencyCustom; + CombinedCurrencyMatcher currency; AffixMatcherWarehouse affixMatcherWarehouse; AffixTokenMatcherWarehouse affixTokenMatcherWarehouse; } fLocalMatchers; diff --git a/icu4c/source/test/intltest/numbertest.h b/icu4c/source/test/intltest/numbertest.h index f8a16e86539..e1a84aab1e3 100644 --- a/icu4c/source/test/intltest/numbertest.h +++ b/icu4c/source/test/intltest/numbertest.h @@ -226,9 +226,10 @@ class NumberParserTest : public IntlTest { void testBasic(); void testLocaleFi(); void testSeriesMatcher(); - void testCurrencyAnyMatcher(); + void testCombinedCurrencyMatcher(); void testAffixPatternMatcher(); void testGroupingDisabled(); + void testCaseFolding(); void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0); }; diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp index 11d6d64cdf7..ec8fee01b83 100644 --- a/icu4c/source/test/intltest/numbertest_parse.cpp +++ b/icu4c/source/test/intltest/numbertest_parse.cpp @@ -23,7 +23,7 @@ void NumberParserTest::runIndexedTest(int32_t index, UBool exec, const char*& na TESTCASE_AUTO_BEGIN; TESTCASE_AUTO(testBasic); TESTCASE_AUTO(testSeriesMatcher); - TESTCASE_AUTO(testCurrencyAnyMatcher); + TESTCASE_AUTO(testCombinedCurrencyMatcher); TESTCASE_AUTO(testAffixPatternMatcher); TESTCASE_AUTO_END; } @@ -211,8 +211,8 @@ void NumberParserTest::testSeriesMatcher() { } } -void NumberParserTest::testCurrencyAnyMatcher() { - IcuTestErrorCode status(*this, "testCurrencyAnyMatcher"); +void NumberParserTest::testCombinedCurrencyMatcher() { + IcuTestErrorCode status(*this, "testCombinedCurrencyMatcher"); IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES); Locale locale = Locale::getEnglish(); diff --git a/icu4c/source/test/testdata/NumberFormatTestCases.txt b/icu4c/source/test/testdata/NumberFormatTestCases.txt index b7c7beb8661..f46810bf01c 100644 --- a/icu4c/source/test/testdata/NumberFormatTestCases.txt +++ b/icu4c/source/test/testdata/NumberFormatTestCases.txt @@ -16,12 +16,12 @@ rt: "0.###" 1.0 "1" # Basics fp: "0.####" 0.10005 "0.1" 0.1 fp: - 0.10006 "0.1001" 0.1001 -pat: - "#0.####" +pat: - "0.####" fp: "#.####" 0.10005 "0.1" 0.1 -pat: - "#0.####" +pat: - "0.####" rt: "0" 1234 "1234" -pat: - "#0" +pat: - "0" # Significant digits fp: "@@@" 1.234567 "1.23" 1.23 @@ -79,12 +79,12 @@ fpc: - 1234.56/JPY "\u00A51,235" 1235/JPY # ISO codes that overlap display names (QQQ vs. Q) # recognize real ISO name in parsing, so, can not use fake name as QQQ #fpc: - 123/QQQ "QQQ123.00" 123/QQQ # QQQ is fake -fpc: - 123/GTQ "GTQ123.00" 123/GTQ +fpc: - 123/GTQ "GTQ 123.00" 123/GTQ # ChoiceFormat-based display names fpc: - 1/INR "\u20b91.00" 1/INR fpc: - 2/INR "\u20b92.00" 2/INR # Display names with shared prefix (YDD vs. Y) -fpc: - 100/YDD "YDD100.00" 100/YDD +fpc: - 100/YDD "YDD 100.00" 100/YDD fpc: - 100/CNY "CN\u00a5100.00" 100/CNY # Regression Tests bug#7914 diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixTokenMatcherFactory.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixTokenMatcherFactory.java index 4c051572aa1..769749f7cb9 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixTokenMatcherFactory.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AffixTokenMatcherFactory.java @@ -33,12 +33,8 @@ public class AffixTokenMatcherFactory { return PermilleMatcher.getInstance(symbols); } - public AnyMatcher currency() { - AnyMatcher any = new AnyMatcher(); - any.addMatcher(CurrencyCustomMatcher.getInstance(currency, locale)); - any.addMatcher(CurrencyNamesMatcher.getInstance(locale)); - any.freeze(); - return any; + public CombinedCurrencyMatcher currency() { + return CombinedCurrencyMatcher.getInstance(currency, symbols); } public IgnorablesMatcher ignorables() { diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java deleted file mode 100644 index e5359ead47c..00000000000 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/AnyMatcher.java +++ /dev/null @@ -1,92 +0,0 @@ -// © 2018 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License -package com.ibm.icu.impl.number.parse; - -import java.util.ArrayList; -import java.util.List; - -import com.ibm.icu.impl.StringSegment; - -/** - * Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses - * the first matcher in the list to succeed. - * - * @author sffc - * @see SeriesMatcher - */ -public class AnyMatcher implements NumberParseMatcher { - - protected List matchers = null; - protected boolean frozen = false; - - public void addMatcher(NumberParseMatcher matcher) { - assert !frozen; - if (matchers == null) { - matchers = new ArrayList(); - } - matchers.add(matcher); - } - - public void freeze() { - frozen = true; - } - - @Override - public boolean match(StringSegment segment, ParsedNumber result) { - assert frozen; - if (matchers == null) { - return false; - } - - int initialOffset = segment.getOffset(); - boolean maybeMore = false; - for (int i = 0; i < matchers.size(); i++) { - NumberParseMatcher matcher = matchers.get(i); - maybeMore = maybeMore || matcher.match(segment, result); - if (segment.getOffset() != initialOffset) { - // Match succeeded. - // NOTE: Except for a couple edge cases, if a matcher accepted string A, then it will - // accept any string starting with A. Therefore, there is no possibility that matchers - // later in the list may be evaluated on longer strings, and we can exit the loop here. - break; - } - } - - // None of the matchers succeeded. - return maybeMore; - } - - @Override - public boolean smokeTest(StringSegment segment) { - assert frozen; - if (matchers == null) { - return false; - } - - for (int i = 0; i < matchers.size(); i++) { - if (matchers.get(i).smokeTest(segment)) { - return true; - } - } - return false; - } - - @Override - public void postProcess(ParsedNumber result) { - assert frozen; - if (matchers == null) { - return; - } - - for (int i = 0; i < matchers.size(); i++) { - NumberParseMatcher matcher = matchers.get(i); - matcher.postProcess(result); - } - } - - @Override - public String toString() { - return ""; - } - -} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CombinedCurrencyMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CombinedCurrencyMatcher.java new file mode 100644 index 00000000000..c1d14189b68 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CombinedCurrencyMatcher.java @@ -0,0 +1,157 @@ +// © 2018 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html#License +package com.ibm.icu.impl.number.parse; + +import java.util.Iterator; + +import com.ibm.icu.impl.StringSegment; +import com.ibm.icu.impl.TextTrieMap; +import com.ibm.icu.text.DecimalFormatSymbols; +import com.ibm.icu.text.UnicodeSet; +import com.ibm.icu.util.Currency; +import com.ibm.icu.util.Currency.CurrencyStringInfo; + +/** + * Matches a currency, either a custom currency or one from the data bundle. The class is called + * "combined" to emphasize that the currency string may come from one of multiple sources. + * + * Will match currency spacing either before or after the number depending on whether we are currently in + * the prefix or suffix. + * + * The implementation of this class is slightly different between J and C. See #13584 for a follow-up. + * + * @author sffc + */ +public class CombinedCurrencyMatcher implements NumberParseMatcher { + + private final String isoCode; + private final String currency1; + private final String currency2; + + private final String afterPrefixInsert; + private final String beforeSuffixInsert; + + private final TextTrieMap longNameTrie; + private final TextTrieMap symbolTrie; + + private final UnicodeSet leadCodePoints; + + public static CombinedCurrencyMatcher getInstance(Currency currency, DecimalFormatSymbols dfs) { + // TODO: Cache these instances. They are somewhat expensive. + return new CombinedCurrencyMatcher(currency, dfs); + } + + private CombinedCurrencyMatcher(Currency currency, DecimalFormatSymbols dfs) { + this.isoCode = currency.getSubtype(); + this.currency1 = currency.getSymbol(dfs.getULocale()); + this.currency2 = currency.getCurrencyCode(); + + afterPrefixInsert = dfs + .getPatternForCurrencySpacing(DecimalFormatSymbols.CURRENCY_SPC_INSERT, false); + beforeSuffixInsert = dfs + .getPatternForCurrencySpacing(DecimalFormatSymbols.CURRENCY_SPC_INSERT, true); + + // TODO: Currency trie does not currently have an option for case folding. It defaults to use + // case folding on long-names but not symbols. + longNameTrie = Currency.getParsingTrie(dfs.getULocale(), Currency.LONG_NAME); + symbolTrie = Currency.getParsingTrie(dfs.getULocale(), Currency.SYMBOL_NAME); + + // Compute the full set of characters that could be the first in a currency to allow for + // efficient smoke test. + leadCodePoints = new UnicodeSet(); + leadCodePoints.add(currency1.codePointAt(0)); + leadCodePoints.add(currency2.codePointAt(0)); + leadCodePoints.add(beforeSuffixInsert.codePointAt(0)); + longNameTrie.putLeadCodePoints(leadCodePoints); + symbolTrie.putLeadCodePoints(leadCodePoints); + // Always apply case mapping closure for currencies + leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS); + leadCodePoints.freeze(); + } + + @Override + public boolean match(StringSegment segment, ParsedNumber result) { + if (result.currencyCode != null) { + return false; + } + + // Try to match a currency spacing separator. + int initialOffset = segment.getOffset(); + boolean maybeMore = false; + if (result.seenNumber()) { + int overlap = segment.getCommonPrefixLength(beforeSuffixInsert); + if (overlap == beforeSuffixInsert.length()) { + segment.adjustOffset(overlap); + // Note: let currency spacing be a weak match. Don't update chars consumed. + } + maybeMore = maybeMore || overlap == segment.length(); + } + + // Match the currency string, and reset if we didn't find one. + maybeMore = maybeMore || matchCurrency(segment, result); + if (result.currencyCode == null) { + segment.setOffset(initialOffset); + return maybeMore; + } + + // Try to match a currency spacing separator. + if (!result.seenNumber()) { + int overlap = segment.getCommonPrefixLength(afterPrefixInsert); + if (overlap == afterPrefixInsert.length()) { + segment.adjustOffset(overlap); + // Note: let currency spacing be a weak match. Don't update chars consumed. + } + maybeMore = maybeMore || overlap == segment.length(); + } + + return maybeMore; + } + + /** Matches the currency string without concern for currency spacing. */ + private boolean matchCurrency(StringSegment segment, ParsedNumber result) { + int overlap1 = segment.getCommonPrefixLength(currency1); + if (overlap1 == currency1.length()) { + result.currencyCode = isoCode; + segment.adjustOffset(overlap1); + result.setCharsConsumed(segment); + return segment.length() == 0; + } + + int overlap2 = segment.getCommonPrefixLength(currency2); + if (overlap2 == currency2.length()) { + result.currencyCode = isoCode; + segment.adjustOffset(overlap2); + result.setCharsConsumed(segment); + return segment.length() == 0; + } + + TextTrieMap.Output trieOutput = new TextTrieMap.Output(); + Iterator values = longNameTrie.get(segment, 0, trieOutput); + if (values == null) { + values = symbolTrie.get(segment, 0, trieOutput); + } + if (values != null) { + result.currencyCode = values.next().getISOCode(); + segment.adjustOffset(trieOutput.matchLength); + result.setCharsConsumed(segment); + } + + return overlap1 == segment.length() || overlap2 == segment.length() || trieOutput.partialMatch; + } + + @Override + public boolean smokeTest(StringSegment segment) { + return segment.startsWith(leadCodePoints); + } + + @Override + public void postProcess(ParsedNumber result) { + // No-op + } + + @Override + public String toString() { + return ""; + } + +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java deleted file mode 100644 index 019af7504fc..00000000000 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyCustomMatcher.java +++ /dev/null @@ -1,67 +0,0 @@ -// © 2017 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License -package com.ibm.icu.impl.number.parse; - -import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.util.Currency; -import com.ibm.icu.util.ULocale; - -/** - * A matcher for a single currency instance (not the full trie). - */ -public class CurrencyCustomMatcher implements NumberParseMatcher { - - private final String isoCode; - private final String currency1; - private final String currency2; - - public static CurrencyCustomMatcher getInstance(Currency currency, ULocale loc) { - return new CurrencyCustomMatcher(currency.getSubtype(), - currency.getSymbol(loc), - currency.getCurrencyCode()); - } - - private CurrencyCustomMatcher(String isoCode, String currency1, String currency2) { - this.isoCode = isoCode; - this.currency1 = currency1; - this.currency2 = currency2; - } - - @Override - public boolean match(StringSegment segment, ParsedNumber result) { - if (result.currencyCode != null) { - return false; - } - - int overlap1 = segment.getCommonPrefixLength(currency1); - if (overlap1 == currency1.length()) { - result.currencyCode = isoCode; - segment.adjustOffset(overlap1); - result.setCharsConsumed(segment); - } - - int overlap2 = segment.getCommonPrefixLength(currency2); - if (overlap2 == currency2.length()) { - result.currencyCode = isoCode; - segment.adjustOffset(overlap2); - result.setCharsConsumed(segment); - } - - return overlap1 == segment.length() || overlap2 == segment.length(); - } - - @Override - public boolean smokeTest(StringSegment segment) { - return segment.startsWith(currency1) || segment.startsWith(currency2); - } - - @Override - public void postProcess(ParsedNumber result) { - // No-op - } - - @Override - public String toString() { - return ""; - } -} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java deleted file mode 100644 index be2acd42f95..00000000000 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/CurrencyNamesMatcher.java +++ /dev/null @@ -1,82 +0,0 @@ -// © 2017 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html#License -package com.ibm.icu.impl.number.parse; - -import java.util.Iterator; - -import com.ibm.icu.impl.StringSegment; -import com.ibm.icu.impl.TextTrieMap; -import com.ibm.icu.text.UnicodeSet; -import com.ibm.icu.util.Currency; -import com.ibm.icu.util.Currency.CurrencyStringInfo; -import com.ibm.icu.util.ULocale; - -/** - * Matches currencies according to all available strings in locale data. - * - * The implementation of this class is different between J and C. See #13584 for a follow-up. - * - * @author sffc - */ -public class CurrencyNamesMatcher implements NumberParseMatcher { - - private final TextTrieMap longNameTrie; - private final TextTrieMap symbolTrie; - - private final UnicodeSet leadCodePoints; - - public static CurrencyNamesMatcher getInstance(ULocale locale) { - // TODO: Pre-compute some of the more popular locales? - return new CurrencyNamesMatcher(locale); - } - - private CurrencyNamesMatcher(ULocale locale) { - // TODO: Currency trie does not currently have an option for case folding. It defaults to use - // case folding on long-names but not symbols. - longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME); - symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME); - - // Compute the full set of characters that could be the first in a currency to allow for - // efficient smoke test. - leadCodePoints = new UnicodeSet(); - longNameTrie.putLeadCodePoints(leadCodePoints); - symbolTrie.putLeadCodePoints(leadCodePoints); - // Always apply case mapping closure for currencies - leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS); - leadCodePoints.freeze(); - } - - @Override - public boolean match(StringSegment segment, ParsedNumber result) { - if (result.currencyCode != null) { - return false; - } - - TextTrieMap.Output trieOutput = new TextTrieMap.Output(); - Iterator values = longNameTrie.get(segment, 0, trieOutput); - if (values == null) { - values = symbolTrie.get(segment, 0, trieOutput); - } - if (values != null) { - result.currencyCode = values.next().getISOCode(); - segment.adjustOffset(trieOutput.matchLength); - result.setCharsConsumed(segment); - } - return trieOutput.partialMatch; - } - - @Override - public boolean smokeTest(StringSegment segment) { - return segment.startsWith(leadCodePoints); - } - - @Override - public void postProcess(ParsedNumber result) { - // No-op - } - - @Override - public String toString() { - return ""; - } -} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java index f91a61ebd4f..9283523275b 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/NumberParserImpl.java @@ -37,11 +37,12 @@ public class NumberParserImpl { public static NumberParserImpl createSimpleParser(ULocale locale, String pattern, int parseFlags) { NumberParserImpl parser = new NumberParserImpl(parseFlags); + Currency currency = Currency.getInstance("USD"); DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale); IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT; AffixTokenMatcherFactory factory = new AffixTokenMatcherFactory(); - factory.currency = Currency.getInstance("USD"); + factory.currency = currency; factory.symbols = symbols; factory.ignorables = ignorables; factory.locale = locale; @@ -61,7 +62,7 @@ public class NumberParserImpl { parser.addMatcher(InfinityMatcher.getInstance(symbols)); parser.addMatcher(PaddingMatcher.getInstance("@")); parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper)); - parser.addMatcher(CurrencyNamesMatcher.getInstance(locale)); + parser.addMatcher(CombinedCurrencyMatcher.getInstance(currency, symbols)); parser.addMatcher(new RequireNumberValidator()); parser.freeze(); @@ -185,8 +186,7 @@ public class NumberParserImpl { //////////////////////// if (parseCurrency || patternInfo.hasCurrencySign()) { - parser.addMatcher(CurrencyCustomMatcher.getInstance(currency, locale)); - parser.addMatcher(CurrencyNamesMatcher.getInstance(locale)); + parser.addMatcher(CombinedCurrencyMatcher.getInstance(currency, symbols)); } /////////////////////////////// diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java index 5f1ca8bfdf6..06ece472110 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java @@ -5974,4 +5974,14 @@ public class NumberFormatTest extends TestFmwk { df.setParseStrict(true); expect2(df, 0.5, "50x%"); } + + @Test + public void testParseIsoStrict() { + DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(ULocale.ENGLISH); + DecimalFormat df = new DecimalFormat("¤¤0;-0¤¤", dfs); + df.setCurrency(Currency.getInstance("USD")); + df.setParseStrict(true); + expect2(df, 45, "USD 45.00"); + expect2(df, -45, "-45.00 USD"); + } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java index b35cd789c96..57aef1547ca 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/number/NumberParserTest.java @@ -13,7 +13,7 @@ import com.ibm.icu.impl.number.CustomSymbolCurrency; import com.ibm.icu.impl.number.DecimalFormatProperties; import com.ibm.icu.impl.number.parse.AffixPatternMatcher; import com.ibm.icu.impl.number.parse.AffixTokenMatcherFactory; -import com.ibm.icu.impl.number.parse.AnyMatcher; +import com.ibm.icu.impl.number.parse.CombinedCurrencyMatcher; import com.ibm.icu.impl.number.parse.IgnorablesMatcher; import com.ibm.icu.impl.number.parse.MinusSignMatcher; import com.ibm.icu.impl.number.parse.NumberParserImpl; @@ -229,12 +229,13 @@ public class NumberParserTest { } @Test - public void testCurrencyAnyMatcher() { + public void testCombinedCurrencyMatcher() { AffixTokenMatcherFactory factory = new AffixTokenMatcherFactory(); factory.locale = ULocale.ENGLISH; CustomSymbolCurrency currency = new CustomSymbolCurrency("ICU", "IU$", "ICU"); factory.currency = currency; - AnyMatcher matcher = factory.currency(); + factory.symbols = DecimalFormatSymbols.getInstance(ULocale.ENGLISH); + CombinedCurrencyMatcher matcher = factory.currency(); Object[][] cases = new Object[][] { { "", null },