From 1ed7deaa8cc6d617ad773832292f2ad320780773 Mon Sep 17 00:00:00 2001 From: Shane Carr Date: Tue, 13 Feb 2018 02:23:52 +0000 Subject: [PATCH] ICU-13574 AffixMatcher is working. All simple parsing tests are passing. X-SVN-Rev: 40903 --- icu4c/source/i18n/numparse_affixes.cpp | 175 ++++++++++-------- icu4c/source/i18n/numparse_affixes.h | 97 ++++++---- icu4c/source/i18n/numparse_compositions.cpp | 8 + icu4c/source/i18n/numparse_compositions.h | 2 + icu4c/source/i18n/numparse_currency.cpp | 16 ++ icu4c/source/i18n/numparse_currency.h | 6 + icu4c/source/i18n/numparse_decimal.cpp | 8 + icu4c/source/i18n/numparse_decimal.h | 2 + icu4c/source/i18n/numparse_impl.cpp | 28 +-- icu4c/source/i18n/numparse_impl.h | 1 + icu4c/source/i18n/numparse_parsednumber.cpp | 10 +- icu4c/source/i18n/numparse_scientific.cpp | 8 + icu4c/source/i18n/numparse_scientific.h | 2 + icu4c/source/i18n/numparse_stringsegment.cpp | 4 + icu4c/source/i18n/numparse_symbols.cpp | 13 ++ icu4c/source/i18n/numparse_symbols.h | 4 + icu4c/source/i18n/numparse_types.h | 3 + icu4c/source/i18n/numparse_unisets.cpp | 4 +- .../source/test/intltest/numbertest_parse.cpp | 88 ++++----- .../icu/impl/number/parse/StringSegment.java | 2 + 20 files changed, 313 insertions(+), 168 deletions(-) diff --git a/icu4c/source/i18n/numparse_affixes.cpp b/icu4c/source/i18n/numparse_affixes.cpp index b5d447f192f..97ba4a1c660 100644 --- a/icu4c/source/i18n/numparse_affixes.cpp +++ b/icu4c/source/i18n/numparse_affixes.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. 
+#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_affixes.h" #include "numparse_utils.h" @@ -122,52 +126,32 @@ AffixPatternMatcher AffixPatternMatcherBuilder::build() { } -AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const UChar* currencyCode, - const UnicodeString* currency1, - const UnicodeString* currency2, - const DecimalFormatSymbols* dfs, - IgnorablesMatcher* ignorables, const Locale* locale) - : currency1(currency1), - currency2(currency2), - dfs(dfs), - ignorables(ignorables), - locale(locale), - codePointCount(0), - codePointNumBatches(0) { - utils::copyCurrencyCode(this->currencyCode, currencyCode); -} +CodePointMatcherWarehouse::CodePointMatcherWarehouse() + : codePointCount(0), codePointNumBatches(0) {} -AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse( - AffixTokenMatcherWarehouse&& src) U_NOEXCEPT = default; - -AffixTokenMatcherWarehouse::~AffixTokenMatcherWarehouse() { +CodePointMatcherWarehouse::~CodePointMatcherWarehouse() { // Delete the variable number of batches of code point matchers for (int32_t i = 0; i < codePointNumBatches; i++) { delete[] codePointsOverflow[i]; } } -NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { - return fMinusSign = {*dfs, true}; +CodePointMatcherWarehouse::CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT + : codePoints(std::move(src.codePoints)), + codePointsOverflow(std::move(src.codePointsOverflow)), + codePointCount(src.codePointCount), + codePointNumBatches(src.codePointNumBatches) {} + +CodePointMatcherWarehouse& +CodePointMatcherWarehouse::operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT { + codePoints = std::move(src.codePoints); + codePointsOverflow = std::move(src.codePointsOverflow); + codePointCount = src.codePointCount; + codePointNumBatches = src.codePointNumBatches; + return *this; } -NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { - return fPlusSign = {*dfs, true}; -} - -NumberParseMatcher& 
AffixTokenMatcherWarehouse::percent() { - return fPercent = {*dfs}; -} - -NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { - return fPermille = {*dfs}; -} - -NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { - return fCurrency = {{*locale, status}, {currencyCode, *currency1, *currency2}}; -} - -NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) { +NumberParseMatcher& CodePointMatcherWarehouse::nextCodePointMatcher(UChar32 cp) { if (codePointCount < CODE_POINT_STACK_CAPACITY) { return codePoints[codePointCount++] = {cp}; } @@ -186,6 +170,39 @@ NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) } +AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData) + : fSetupData(setupData) {} + +NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() { + return fMinusSign = {fSetupData->dfs, true}; +} + +NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() { + return fPlusSign = {fSetupData->dfs, true}; +} + +NumberParseMatcher& AffixTokenMatcherWarehouse::percent() { + return fPercent = {fSetupData->dfs}; +} + +NumberParseMatcher& AffixTokenMatcherWarehouse::permille() { + return fPermille = {fSetupData->dfs}; +} + +NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) { + return fCurrency = {{fSetupData->locale, status}, + {fSetupData->currencyCode, fSetupData->currency1, fSetupData->currency2}}; +} + +IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() { + return fSetupData->ignorables; +} + +NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) { + return fCodePoints.nextCodePointMatcher(cp); +} + + CodePointMatcher::CodePointMatcher(UChar32 cp) : fCp(cp) {} @@ -207,9 +224,13 @@ const UnicodeSet& CodePointMatcher::getLeadCodePoints() { return *fLocalLeadCodePoints; } +UnicodeString CodePointMatcher::toString() const { + return u""; +} + AffixPatternMatcher 
AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern, - AffixTokenMatcherWarehouse& warehouse, + AffixTokenMatcherWarehouse& tokenWarehouse, parse_flags_t parseFlags, bool* success, UErrorCode& status) { if (affixPattern.isEmpty()) { @@ -222,10 +243,10 @@ AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& a if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) { ignorables = nullptr; } else { - ignorables = warehouse.ignorables; + ignorables = &tokenWarehouse.ignorables(); } - AffixPatternMatcherBuilder builder(affixPattern, warehouse, ignorables); + AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables); AffixUtils::iterateWithConsumer(UnicodeStringCharSequence(affixPattern), builder, status); return builder.build(); } @@ -243,10 +264,9 @@ bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const { } -AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse& warehouse) - : fAffixTokenMatcherWarehouse(std::move(warehouse)) {} - -AffixMatcherWarehouse& AffixMatcherWarehouse::operator=(AffixMatcherWarehouse&& src) = default; +AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse) + : fTokenWarehouse(tokenWarehouse) { +} bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo, const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, @@ -278,18 +298,14 @@ bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInf return true; } -AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, - MutableMatcherCollection& output, - AffixTokenMatcherWarehouse tokenWarehouse, - const IgnorablesMatcher& ignorables, - parse_flags_t parseFlags, - UErrorCode& status) { +void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo, + MutableMatcherCollection& output, + const IgnorablesMatcher& ignorables, + parse_flags_t 
parseFlags, UErrorCode& status) { if (!isInteresting(patternInfo, ignorables, parseFlags, status)) { - return {}; + return; } - AffixMatcherWarehouse warehouse(tokenWarehouse); - // The affixes have interesting characters, or we are in strict mode. // Use initial capacity of 6, the highest possible number of AffixMatchers. UnicodeString sb; @@ -309,21 +325,19 @@ AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatt bool hasPrefix = false; PatternStringUtils::patternInfoToStringBuilder( patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb); - warehouse.fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( - sb, tokenWarehouse, parseFlags, &hasPrefix, status); - AffixPatternMatcher* prefix = hasPrefix - ? &warehouse.fAffixPatternMatchers[numAffixPatternMatchers++] - : nullptr; + fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( + sb, *fTokenWarehouse, parseFlags, &hasPrefix, status); + AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++] + : nullptr; // Generate Suffix bool hasSuffix = false; PatternStringUtils::patternInfoToStringBuilder( patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb); - warehouse.fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( - sb, tokenWarehouse, parseFlags, &hasSuffix, status); - AffixPatternMatcher* suffix = hasSuffix - ? &warehouse.fAffixPatternMatchers[numAffixPatternMatchers++] - : nullptr; + fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern( + sb, *fTokenWarehouse, parseFlags, &hasSuffix, status); + AffixPatternMatcher* suffix = hasSuffix ? 
&fAffixPatternMatchers[numAffixPatternMatchers++] + : nullptr; if (signum == 1) { posPrefix = prefix; @@ -338,14 +352,14 @@ AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatt // Note: it is indeed possible for posPrefix and posSuffix to both be null. // We still need to add that matcher for strict mode to work. - warehouse.fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; + fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags}; if (includeUnpaired && prefix != nullptr && suffix != nullptr) { // The following if statements are designed to prevent adding two identical matchers. - if (signum == 1 || equals(prefix, posPrefix)) { - warehouse.fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; + if (signum == 1 || !equals(prefix, posPrefix)) { + fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags}; } - if (signum == 1 || equals(suffix, posSuffix)) { - warehouse.fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; + if (signum == 1 || !equals(suffix, posSuffix)) { + fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags}; } } } @@ -356,19 +370,20 @@ AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatt do { madeChanges = false; for (int32_t i = 1; i < numAffixMatchers; i++) { - if (warehouse.fAffixMatchers[i - 1].compareTo(warehouse.fAffixMatchers[i]) > 0) { + if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) { madeChanges = true; - AffixMatcher temp = std::move(warehouse.fAffixMatchers[i - 1]); - warehouse.fAffixMatchers[i - 1] = std::move(warehouse.fAffixMatchers[i]); - warehouse.fAffixMatchers[i] = std::move(temp); + AffixMatcher temp = std::move(fAffixMatchers[i - 1]); + fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]); + fAffixMatchers[i] = std::move(temp); } } } while (madeChanges); - for (int32_t i = 0; i < numAffixMatchers; i++) { - output.addMatcher(warehouse.fAffixMatchers[i]); - } - return warehouse; + for (int32_t i = 0; i < 
numAffixMatchers; i++) { + // Enable the following line to debug affixes + //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl; + output.addMatcher(fAffixMatchers[i]); + } } @@ -454,6 +469,14 @@ int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const { } } +UnicodeString AffixMatcher::toString() const { + bool isNegative = 0 != (fFlags & FLAG_NEGATIVE); + return UnicodeString(u"getPattern() : u"null") + u"#" + + (fSuffix ? fSuffix->getPattern() : u"null") + u">"; + +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_affixes.h b/icu4c/source/i18n/numparse_affixes.h index 17175ce7d90..59789e03950 100644 --- a/icu4c/source/i18n/numparse_affixes.h +++ b/icu4c/source/i18n/numparse_affixes.h @@ -12,6 +12,8 @@ #include "numparse_currency.h" #include "number_affixutils.h" +#include + U_NAMESPACE_BEGIN namespace numparse { namespace impl { @@ -33,11 +35,57 @@ class CodePointMatcher : public NumberParseMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + private: UChar32 fCp; }; +/** + * A warehouse to retain ownership of CodePointMatchers. + */ +class CodePointMatcherWarehouse : public UMemory { + private: + static constexpr int32_t CODE_POINT_STACK_CAPACITY = 5; // Number of entries directly on the stack + static constexpr int32_t CODE_POINT_BATCH_SIZE = 10; // Number of entries per heap allocation + + public: + CodePointMatcherWarehouse(); + + // A custom destructor is needed to free the memory from MaybeStackArray. + // A custom move constructor and move assignment seem to be needed because of the custom destructor. 
+ + ~CodePointMatcherWarehouse(); + + CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT; + + CodePointMatcherWarehouse& operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT; + + NumberParseMatcher& nextCodePointMatcher(UChar32 cp); + + private: + std::array codePoints; // By value + MaybeStackArray codePointsOverflow; // On heap in "batches" + int32_t codePointCount; // Total for both the ones by value and on heap + int32_t codePointNumBatches; // Number of batches in codePointsOverflow +}; + + +struct AffixTokenMatcherSetupData { + const UChar* currencyCode; + const UnicodeString& currency1; + const UnicodeString& currency2; + const DecimalFormatSymbols& dfs; + IgnorablesMatcher& ignorables; + const Locale& locale; + +// const UChar* currencyCode, const UnicodeString* currency1, +// const UnicodeString* currency2, const DecimalFormatSymbols* dfs, +// IgnorablesMatcher* ignorables, const Locale* locale +}; + + /** * Small helper class that generates matchers for individual tokens for AffixPatternMatcher. 
* @@ -48,21 +96,11 @@ class CodePointMatcher : public NumberParseMatcher, public UMemory { * * @author sffc */ -class AffixTokenMatcherWarehouse { - private: - static constexpr int32_t CODE_POINT_STACK_CAPACITY = 5; // Number of entries directly on the stack - static constexpr int32_t CODE_POINT_BATCH_SIZE = 10; // Number of entries per heap allocation - +class AffixTokenMatcherWarehouse : public UMemory { public: AffixTokenMatcherWarehouse() = default; // WARNING: Leaves the object in an unusable state - AffixTokenMatcherWarehouse(const UChar* currencyCode, const UnicodeString* currency1, - const UnicodeString* currency2, const DecimalFormatSymbols* dfs, - IgnorablesMatcher* ignorables, const Locale* locale); - - AffixTokenMatcherWarehouse(AffixTokenMatcherWarehouse&& src) U_NOEXCEPT; - - ~AffixTokenMatcherWarehouse(); + AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData); NumberParseMatcher& minusSign(); @@ -74,16 +112,13 @@ class AffixTokenMatcherWarehouse { NumberParseMatcher& currency(UErrorCode& status); + IgnorablesMatcher& ignorables(); + NumberParseMatcher& nextCodePointMatcher(UChar32 cp); private: - // NOTE: The following fields may be unsafe to access after construction is done! - UChar currencyCode[4]; - const UnicodeString* currency1; - const UnicodeString* currency2; - const DecimalFormatSymbols* dfs; - IgnorablesMatcher* ignorables; - const Locale* locale; + // NOTE: The following field may be unsafe to access after construction is done! + const AffixTokenMatcherSetupData* fSetupData; // NOTE: These are default-constructed and should not be used until initialized. 
MinusSignMatcher fMinusSign; @@ -92,10 +127,8 @@ class AffixTokenMatcherWarehouse { PermilleMatcher fPermille; CurrencyAnyMatcher fCurrency; - CodePointMatcher codePoints[CODE_POINT_STACK_CAPACITY]; // By value - MaybeStackArray codePointsOverflow; // On heap in "batches" - int32_t codePointCount; // Total for both the ones by value and on heap - int32_t codePointNumBatches; // Number of batches in codePointsOverflow + // Use a child class for code point matchers, since it requires non-default operators. + CodePointMatcherWarehouse fCodePoints; friend class AffixPatternMatcherBuilder; friend class AffixPatternMatcher; @@ -161,6 +194,8 @@ class AffixMatcher : public NumberParseMatcher, public UMemory { int8_t compareTo(const AffixMatcher& rhs) const; + UnicodeString toString() const override; + private: AffixPatternMatcher* fPrefix; AffixPatternMatcher* fSuffix; @@ -175,23 +210,19 @@ class AffixMatcherWarehouse { public: AffixMatcherWarehouse() = default; // WARNING: Leaves the object in an unusable state - AffixMatcherWarehouse(AffixTokenMatcherWarehouse& warehouse); + AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse); - AffixMatcherWarehouse& operator=(AffixMatcherWarehouse&& src); - - static AffixMatcherWarehouse createAffixMatchers(const AffixPatternProvider& patternInfo, - MutableMatcherCollection& output, - AffixTokenMatcherWarehouse tokenWarehouse, - const IgnorablesMatcher& ignorables, - parse_flags_t parseFlags, UErrorCode& status); + void createAffixMatchers(const AffixPatternProvider& patternInfo, MutableMatcherCollection& output, + const IgnorablesMatcher& ignorables, parse_flags_t parseFlags, + UErrorCode& status); private: // 9 is the limit: positive, zero, and negative, each with prefix, suffix, and prefix+suffix AffixMatcher fAffixMatchers[9]; // 6 is the limit: positive, zero, and negative, a prefix and a suffix for each AffixPatternMatcher fAffixPatternMatchers[6]; - // Store all the tokens used by the AffixPatternMatchers - 
AffixTokenMatcherWarehouse fAffixTokenMatcherWarehouse; + // Reference to the warehouse for tokens used by the AffixPatternMatchers + AffixTokenMatcherWarehouse* fTokenWarehouse; friend class AffixMatcher; diff --git a/icu4c/source/i18n/numparse_compositions.cpp b/icu4c/source/i18n/numparse_compositions.cpp index 6d8d52d2ba5..11e1cbcdf04 100644 --- a/icu4c/source/i18n/numparse_compositions.cpp +++ b/icu4c/source/i18n/numparse_compositions.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_compositions.h" #include "unicode/uniset.h" @@ -113,5 +117,9 @@ const NumberParseMatcher* const* ArraySeriesMatcher::end() const { return fMatchers.getAlias() + fMatchersLen; } +UnicodeString ArraySeriesMatcher::toString() const { + return u""; +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_compositions.h b/icu4c/source/i18n/numparse_compositions.h index 51501a805c2..91db6fa2caa 100644 --- a/icu4c/source/i18n/numparse_compositions.h +++ b/icu4c/source/i18n/numparse_compositions.h @@ -87,6 +87,8 @@ class ArraySeriesMatcher : public SeriesMatcher { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + int32_t length() const override; protected: diff --git a/icu4c/source/i18n/numparse_currency.cpp b/icu4c/source/i18n/numparse_currency.cpp index cf56a94f6d2..b3a317ef716 100644 --- a/icu4c/source/i18n/numparse_currency.cpp +++ b/icu4c/source/i18n/numparse_currency.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. 
+#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_currency.h" #include "ucurrimp.h" @@ -66,6 +70,10 @@ const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() { return *fLocalLeadCodePoints; } +UnicodeString CurrencyNamesMatcher::toString() const { + return u""; +} + CurrencyCustomMatcher::CurrencyCustomMatcher(const char16_t* currencyCode, const UnicodeString& currency1, const UnicodeString& currency2) @@ -106,6 +114,10 @@ const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() { return *fLocalLeadCodePoints; } +UnicodeString CurrencyCustomMatcher::toString() const { + return u""; +} + CurrencyAnyMatcher::CurrencyAnyMatcher() { fMatcherArray[0] = &fNamesMatcher; @@ -151,5 +163,9 @@ const NumberParseMatcher* const* CurrencyAnyMatcher::end() const { return fMatcherArray + 2; } +UnicodeString CurrencyAnyMatcher::toString() const { + return u""; +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_currency.h b/icu4c/source/i18n/numparse_currency.h index 547f6ee0417..3d8cb3a2bfb 100644 --- a/icu4c/source/i18n/numparse_currency.h +++ b/icu4c/source/i18n/numparse_currency.h @@ -32,6 +32,8 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + private: // We could use Locale instead of CharString here, but // Locale has a non-trivial default constructor. 
@@ -51,6 +53,8 @@ class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + private: UChar fCurrencyCode[4]; UnicodeString fCurrency1; @@ -75,6 +79,8 @@ class CurrencyAnyMatcher : public AnyMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + protected: const NumberParseMatcher* const* begin() const override; diff --git a/icu4c/source/i18n/numparse_decimal.cpp b/icu4c/source/i18n/numparse_decimal.cpp index e80014fa591..33b6821ff2c 100644 --- a/icu4c/source/i18n/numparse_decimal.cpp +++ b/icu4c/source/i18n/numparse_decimal.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_decimal.h" #include "numparse_unisets.h" @@ -312,5 +316,9 @@ const UnicodeSet& DecimalMatcher::getLeadCodePoints() { return *fLocalLeadCodePoints; } +UnicodeString DecimalMatcher::toString() const { + return u""; +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_decimal.h b/icu4c/source/i18n/numparse_decimal.h index 203cb66b4b4..78ad074f190 100644 --- a/icu4c/source/i18n/numparse_decimal.h +++ b/icu4c/source/i18n/numparse_decimal.h @@ -29,6 +29,8 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + private: /** If true, only accept strings whose grouping sizes match the locale */ bool requireGroupingMatch; diff --git a/icu4c/source/i18n/numparse_impl.cpp b/icu4c/source/i18n/numparse_impl.cpp index 5bf373de58a..06efe9d1b1f 100644 --- a/icu4c/source/i18n/numparse_impl.cpp +++ b/icu4c/source/i18n/numparse_impl.cpp @@ -5,7 +5,8 @@ #if 
!UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT -// Allow implicit conversion from char16_t* to UnicodeString for this file +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "number_types.h" @@ -17,6 +18,9 @@ #include "unicode/numberformatter.h" #include +#include +#include +#include "cstr.h" using namespace icu; using namespace icu::number; @@ -35,24 +39,20 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES}; IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables; + const UChar currencyCode[] = u"USD"; UnicodeString currency1(u"IU$"); UnicodeString currency2(u"ICU"); ParsedPatternInfo patternInfo; PatternParser::parseToPatternInfo(patternString, patternInfo, status); - // The following statement sets up the affix matchers. -// AffixMatcherWarehouse warehouse = ; - - parser->fLocalMatchers.affixMatcherWarehouse = std::move(AffixMatcherWarehouse::createAffixMatchers( - patternInfo, - *parser, - AffixTokenMatcherWarehouse( - u"USD", &currency1, &currency2, &symbols, &ignorables, &locale), - ignorables, - parseFlags, - status)); - + // The following statements set up the affix matchers. 
+ AffixTokenMatcherSetupData affixSetupData = { + currencyCode, currency1, currency2, symbols, ignorables, locale}; + parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData}; + parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse}; + parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers( + patternInfo, *parser, ignorables, parseFlags, status); Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO); grouper.setLocaleData(patternInfo, locale); @@ -233,7 +233,7 @@ UnicodeString NumberParserImpl::toString() const { UnicodeString result(u"toString()); } result.append(u" ]>", -1); return result; diff --git a/icu4c/source/i18n/numparse_impl.h b/icu4c/source/i18n/numparse_impl.h index cfae156d564..210dcf47549 100644 --- a/icu4c/source/i18n/numparse_impl.h +++ b/icu4c/source/i18n/numparse_impl.h @@ -60,6 +60,7 @@ class NumberParserImpl : public MutableMatcherCollection { ScientificMatcher scientific; CurrencyNamesMatcher currencyNames; AffixMatcherWarehouse affixMatcherWarehouse; + AffixTokenMatcherWarehouse affixTokenMatcherWarehouse; } fLocalMatchers; NumberParserImpl(parse_flags_t parseFlags, bool computeLeads); diff --git a/icu4c/source/i18n/numparse_parsednumber.cpp b/icu4c/source/i18n/numparse_parsednumber.cpp index 908cffecef7..4cf85a0f432 100644 --- a/icu4c/source/i18n/numparse_parsednumber.cpp +++ b/icu4c/source/i18n/numparse_parsednumber.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. 
+#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include @@ -67,7 +71,11 @@ double ParsedNumber::getDouble() const { } // TODO: MIN_LONG - return quantity.toDouble(); + double d = quantity.toDouble(); + if (0 != (flags & FLAG_NEGATIVE)) { + d *= -1; + } + return d; } bool ParsedNumber::isBetterThan(const ParsedNumber& other) { diff --git a/icu4c/source/i18n/numparse_scientific.cpp b/icu4c/source/i18n/numparse_scientific.cpp index 18ade048fb2..bcb777101a4 100644 --- a/icu4c/source/i18n/numparse_scientific.cpp +++ b/icu4c/source/i18n/numparse_scientific.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_scientific.h" #include "numparse_unisets.h" @@ -83,5 +87,9 @@ const UnicodeSet& ScientificMatcher::getLeadCodePoints() { return *fLocalLeadCodePoints; } +UnicodeString ScientificMatcher::toString() const { + return u""; +} + #endif /* #if !UCONFIG_NO_FORMATTING */ diff --git a/icu4c/source/i18n/numparse_scientific.h b/icu4c/source/i18n/numparse_scientific.h index 2f4118ff618..0265c106087 100644 --- a/icu4c/source/i18n/numparse_scientific.h +++ b/icu4c/source/i18n/numparse_scientific.h @@ -27,6 +27,8 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + private: UnicodeString fExponentSeparatorString; DecimalMatcher fExponentMatcher; diff --git a/icu4c/source/i18n/numparse_stringsegment.cpp b/icu4c/source/i18n/numparse_stringsegment.cpp index 36838900906..b2dcdc36423 100644 --- a/icu4c/source/i18n/numparse_stringsegment.cpp +++ b/icu4c/source/i18n/numparse_stringsegment.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to 
UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_stringsegment.h" #include "putilimp.h" diff --git a/icu4c/source/i18n/numparse_symbols.cpp b/icu4c/source/i18n/numparse_symbols.cpp index 3ba12a68df7..6492f1321ff 100644 --- a/icu4c/source/i18n/numparse_symbols.cpp +++ b/icu4c/source/i18n/numparse_symbols.cpp @@ -5,6 +5,10 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. +#define UNISTR_FROM_STRING_EXPLICIT + #include "numparse_types.h" #include "numparse_symbols.h" #include "numparse_utils.h" @@ -70,6 +74,11 @@ const UnicodeSet& SymbolMatcher::getLeadCodePoints() { return *fLocalLeadCodePoints; } +UnicodeString SymbolMatcher::toString() const { + // TODO: Customize output for each symbol + return u""; +} + IgnorablesMatcher::IgnorablesMatcher(unisets::Key key) : SymbolMatcher({}, key) { @@ -79,6 +88,10 @@ bool IgnorablesMatcher::isFlexible() const { return true; } +UnicodeString IgnorablesMatcher::toString() const { + return u""; +} + bool IgnorablesMatcher::isDisabled(const ParsedNumber&) const { return false; } diff --git a/icu4c/source/i18n/numparse_symbols.h b/icu4c/source/i18n/numparse_symbols.h index cf5b8d86680..2cc107101da 100644 --- a/icu4c/source/i18n/numparse_symbols.h +++ b/icu4c/source/i18n/numparse_symbols.h @@ -30,6 +30,8 @@ class SymbolMatcher : public NumberParseMatcher, public UMemory { const UnicodeSet& getLeadCodePoints() override; + UnicodeString toString() const override; + virtual bool isDisabled(const ParsedNumber& result) const = 0; virtual void accept(StringSegment& segment, ParsedNumber& result) const = 0; @@ -50,6 +52,8 @@ class IgnorablesMatcher : public SymbolMatcher { bool isFlexible() const override; + UnicodeString toString() const override; + protected: bool isDisabled(const 
ParsedNumber& result) const override; diff --git a/icu4c/source/i18n/numparse_types.h b/icu4c/source/i18n/numparse_types.h index d958a97b9d6..3f27da05e28 100644 --- a/icu4c/source/i18n/numparse_types.h +++ b/icu4c/source/i18n/numparse_types.h @@ -318,6 +318,9 @@ class NumberParseMatcher { // Default implementation: no-op }; + // String for debugging + virtual UnicodeString toString() const = 0; + protected: // No construction except by subclasses! NumberParseMatcher() = default; diff --git a/icu4c/source/i18n/numparse_unisets.cpp b/icu4c/source/i18n/numparse_unisets.cpp index 625e1ac31dc..fc0274f2a3a 100644 --- a/icu4c/source/i18n/numparse_unisets.cpp +++ b/icu4c/source/i18n/numparse_unisets.cpp @@ -5,8 +5,8 @@ #if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT -// Allow implicit conversion from char16_t* to UnicodeString for this file -// (useful for UnicodeSet constructor) +// Allow implicit conversion from char16_t* to UnicodeString for this file: +// Helpful in toString methods and elsewhere. #define UNISTR_FROM_STRING_EXPLICIT #include "numparse_unisets.h" diff --git a/icu4c/source/test/intltest/numbertest_parse.cpp b/icu4c/source/test/intltest/numbertest_parse.cpp index 15cfb40a05c..16323b52db1 100644 --- a/icu4c/source/test/intltest/numbertest_parse.cpp +++ b/icu4c/source/test/intltest/numbertest_parse.cpp @@ -69,33 +69,33 @@ void NumberParserTest::testBasic() { {3, u"-∞", u"0", 2, -INFINITY}, {3, u"@@@123 @@", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak? {3, u"@@@123@@ ", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak? 
-// {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.}, -// {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.}, + {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.}, + {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.}, {3, u"514.23 USD", u"¤0", 10, 514.23}, {3, u"514.23 GBP", u"¤0", 10, 514.23}, -// {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.}, -// {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.}, -// {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.}, + {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.}, + {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.}, + {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.}, {3, u"𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 10, 51423.}, {3, u"[𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, 51423.}, {3, u"𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 11, 51423.}, {3, u"[𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 12, 51423.}, -// {3, u"(𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, -51423.}, -// {3, u"𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 11, -51423.}, -// {3, u"(𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 12, -51423.}, -// {3, u"𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 10, 51423.}, -// {3, u"{𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 11, 51423.}, -// {3, u"𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 11, 51423.}, -// {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.}, -// {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number -// {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b" + {3, u"(𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, -51423.}, + {3, u"𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 11, -51423.}, + {3, u"(𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 12, -51423.}, + {3, u"𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 10, 51423.}, + {3, u"{𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 11, 51423.}, + {3, u"𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 11, 51423.}, + {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.}, + {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number + {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b" {3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.}, {3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142}, {3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142}, {7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5}, -// {3, u"a$ b5", u"a ¤ b0", 5, 5.0}, -// {3, u"📺1.23", u"📺0;📻0", 6, 1.23}, -// {3, u"📻1.23", u"📺0;📻0", 6, -1.23}, + {3, u"a$ b5", u"a ¤ b0", 5, 5.0}, + {3, u"📺1.23", u"📺0;📻0", 6, 1.23}, + {3, 
u"📻1.23", u"📺0;📻0", 6, -1.23}, {3, u".00", u"0", 3, 0.0}, {3, u" 1,234", u"a0", 35, 1234.}, // should not hang {3, u"NaN", u"0", 3, NAN}, @@ -215,27 +215,29 @@ void NumberParserTest::testSeriesMatcher() { void NumberParserTest::testCurrencyAnyMatcher() { IcuTestErrorCode status(*this, "testCurrencyAnyMatcher"); - UnicodeString currency1(u"IU$"); - UnicodeString currency2(u"ICU"); - DecimalFormatSymbols symbols("en", status); IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES); - Locale locale("en"); - AffixTokenMatcherWarehouse warehouse(u"ICU", &currency1, &currency2, &symbols, &ignorables, &locale); + AffixTokenMatcherSetupData affixSetupData = { + u"ICU", + u"IU$", + u"ICU", + {"en", status}, + ignorables, + "en"}; + AffixTokenMatcherWarehouse warehouse(&affixSetupData); NumberParseMatcher& matcher = warehouse.currency(status); - static const struct TestCase{ + static const struct TestCase { const char16_t* input; const char16_t* expectedCurrencyCode; - } cases[] { - { u"", u"\x00" }, - { u"FOO", u"\x00" }, - { u"USD", u"USD" }, - { u"$", u"USD" }, - { u"US dollars", u"USD" }, - { u"eu", u"\x00" }, - { u"euros", u"EUR" }, - { u"ICU", u"ICU" }, - { u"IU$", u"ICU" } }; + } cases[]{{u"", u"\x00"}, + {u"FOO", u"\x00"}, + {u"USD", u"USD"}, + {u"$", u"USD"}, + {u"US dollars", u"USD"}, + {u"eu", u"\x00"}, + {u"euros", u"EUR"}, + {u"ICU", u"ICU"}, + {u"IU$", u"ICU"}}; for (auto& cas : cases) { UnicodeString input(cas.input); @@ -243,7 +245,8 @@ void NumberParserTest::testCurrencyAnyMatcher() { ParsedNumber result; matcher.match(segment, result, status); assertEquals("Parsing " + input, cas.expectedCurrencyCode, result.currencyCode); - assertEquals("Whole string on " + input, + assertEquals( + "Whole string on " + input, cas.expectedCurrencyCode[0] == 0 ?
0 : input.length(), result.charEnd); } @@ -251,13 +254,15 @@ void NumberParserTest::testCurrencyAnyMatcher() { void NumberParserTest::testAffixPatternMatcher() { IcuTestErrorCode status(*this, "testAffixPatternMatcher"); - - UnicodeString currency1(u"foo"); - UnicodeString currency2(u"bar"); - DecimalFormatSymbols symbols("en", status); IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES); - Locale locale("en"); - AffixTokenMatcherWarehouse warehouse(u"EUR", &currency1, &currency2, &symbols, &ignorables, &locale); + AffixTokenMatcherSetupData affixSetupData = { + u"USD", + u"foo", + u"bar", + {"en", status}, + ignorables, + "en"}; + AffixTokenMatcherWarehouse warehouse(&affixSetupData); static const struct TestCase { bool exactMatch; @@ -269,8 +274,7 @@ void NumberParserTest::testAffixPatternMatcher() { {true, u"+-%", 3, u"+-%"}, {false, u"ab c", 5, u"a bc"}, {true, u"abc", 3, u"abc"}, - {false, u"hello-to+this%very¤long‰string", 59, u"hello-to+this%very USD long‰string"} - }; + {false, u"hello-to+this%very¤long‰string", 59, u"hello-to+this%very USD long‰string"}}; for (auto& cas : cases) { UnicodeString affixPattern(cas.affixPattern); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java index 39416fd7535..95d94bb3735 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/parse/StringSegment.java @@ -9,6 +9,8 @@ import com.ibm.icu.text.UnicodeSet; * A mutable class allowing for a String with a variable offset and length. The charAt, length, and * subSequence methods all operate relative to the fixed offset into the String. * + * TODO: Make sure that this operates only on code point boundaries. + * * @author sffc */ public class StringSegment implements CharSequence {