ICU-13574 Adding composition matchers (SeriesMatcher and AnyMatcher) to ICU4C in preparation for affix matchers. Also re-working memory management in getLeadCodePoints().

X-SVN-Rev: 40890
This commit is contained in:
Shane Carr 2018-02-10 06:36:07 +00:00
parent 852897ba2c
commit 513f123a8c
25 changed files with 596 additions and 79 deletions

View file

@ -110,7 +110,7 @@ number_padding.o number_patternmodifier.o number_patternstring.o \
number_rounding.o number_scientific.o number_stringbuilder.o \
numparse_stringsegment.o numparse_unisets.o numparse_parsednumber.o \
numparse_impl.o numparse_symbols.o numparse_decimal.o numparse_scientific.o \
numparse_currency.o
numparse_currency.o numparse_affixes.o numparse_compositions.o
## Header files to install

View file

@ -0,0 +1,20 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
#include "numparse_types.h"
#include "numparse_affixes.h"
using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -0,0 +1,25 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
#ifndef __NUMPARSE_AFFIXES_H__
#define __NUMPARSE_AFFIXES_H__
#include "numparse_types.h"
U_NAMESPACE_BEGIN
namespace numparse {
namespace impl {
} // namespace impl
} // namespace numparse
U_NAMESPACE_END
#endif //__NUMPARSE_AFFIXES_H__
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -0,0 +1,108 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
#include "numparse_types.h"
#include "numparse_compositions.h"
#include "unicode/uniset.h"
using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;
/**
 * Offers the segment to each child matcher in turn and stops at the first one that consumes input.
 *
 * @param segment The segment to match against. Its offset is advanced by whichever child matcher
 *                consumes input first; it is left untouched if none does.
 * @param result  Parse state, handed through to every child matcher that is run.
 * @param status  ICU error code, handed through to every child matcher that is run.
 * @return true if at least one of the child matchers that were run reported that it might match
 *         if more input were available.
 */
bool AnyMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    const int32_t initialOffset = segment.getOffset();
    bool maybeMore = false;

    // NOTE: The range-based for loop calls the virtual begin() and end() methods.
    for (auto* matcher : *this) {
        // The call must be made unconditionally. Writing `maybeMore || matcher->match(...)` would
        // short-circuit: once any matcher had returned true, no later matcher would ever run.
        const bool matcherMaybeMore = matcher->match(segment, result, status);
        maybeMore = maybeMore || matcherMaybeMore;

        if (segment.getOffset() != initialOffset) {
            // Match succeeded.
            // NOTE: Except for a couple edge cases, if a matcher accepted string A, then it will
            // accept any string starting with A. Therefore, there is no possibility that matchers
            // later in the list may be evaluated on longer strings, and we can exit the loop here.
            break;
        }
    }

    // Reached both when a matcher consumed input (break above) and when none of them did.
    return maybeMore;
}
/** Forwards the end-of-parse postProcess() hook to every child matcher, in order. */
void AnyMatcher::postProcess(ParsedNumber& result) const {
    // begin() and end() are virtual: the concrete subclass decides which matchers are visited.
    const NumberParseMatcher* const* cursor = begin();
    const NumberParseMatcher* const* const last = end();
    for (; cursor != last; ++cursor) {
        (*cursor)->postProcess(result);
    }
}
/**
 * Runs the child matchers one after another against the segment.
 *
 * A flexible matcher is re-run for as long as it keeps consuming input and is skipped once it stops.
 * A non-flexible matcher must consume input; if it does not, the segment offset and the result are
 * rolled back to their values on entry and the series gives up.
 *
 * @return the "maybe more" flag of the last step that was evaluated.
 */
bool SeriesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    // Snapshot taken up front so that a failed series can be undone completely.
    ParsedNumber savedResult(result);
    const int32_t startOffset = segment.getOffset();
    bool maybeMore = true;

    const NumberParseMatcher* const* cursor = begin();
    while (cursor < end()) {
        const NumberParseMatcher* current = *cursor;
        const int32_t offsetBefore = segment.getOffset();

        // With nothing left to match, skip the call and simply ask for more input.
        maybeMore = (segment.length() != 0) ? current->match(segment, result, status) : true;

        const bool consumed = (segment.getOffset() != offsetBefore);
        const bool flexible = current->isFlexible();

        if (!consumed && !flexible) {
            // A mandatory matcher failed: undo everything and exit.
            segment.setOffset(startOffset);
            result = savedResult;
            return maybeMore;
        }
        if (consumed && flexible) {
            // A flexible matcher made progress: give it another turn.
            continue;
        }
        // Either a mandatory matcher succeeded or a flexible one is exhausted: move on.
        cursor++;
    }

    // All matchers in the series succeeded.
    return maybeMore;
}
/** Forwards the end-of-parse postProcess() hook to every matcher in the series, in order. */
void SeriesMatcher::postProcess(ParsedNumber& result) const {
    // The range comes from the virtual begin() and end() methods of the concrete subclass.
    const NumberParseMatcher* const* it = begin();
    const NumberParseMatcher* const* const stop = end();
    while (it != stop) {
        (*it)->postProcess(result);
        ++it;
    }
}
// Stores the array pointer in the fMatchers member and records its length in fMatchersLen.
// Only the array itself is handed to fMatchers; the matchers it points to are not taken over here.
ArraySeriesMatcher::ArraySeriesMatcher(NumberParseMatcher** matchers, int32_t matchersLen)
: fMatchers(matchers), fMatchersLen(matchersLen) {}
/**
 * Returns the lead code points of the series, which are those of its first matcher.
 *
 * @return a reference to the set returned by the first matcher in the array.
 */
const UnicodeSet& ArraySeriesMatcher::getLeadCodePoints() {
    // The answer is read from element 0, so an empty series has no valid lead set.
    U_ASSERT(fMatchersLen > 0);
    // SeriesMatchers are never allowed to start with a Flexible matcher.
    U_ASSERT(!fMatchers[0]->isFlexible());
    return fMatchers[0]->getLeadCodePoints();
}
// Start of the iteration range: pointer to the first element of the matcher array.
const NumberParseMatcher* const* ArraySeriesMatcher::begin() const {
return fMatchers.getAlias();
}
// End of the iteration range: pointer one past the last of the fMatchersLen array elements.
const NumberParseMatcher* const* ArraySeriesMatcher::end() const {
return fMatchers.getAlias() + fMatchersLen;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -0,0 +1,100 @@
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
#ifndef __SOURCE_NUMPARSE_COMPOSITIONS__
#define __SOURCE_NUMPARSE_COMPOSITIONS__
#include "numparse_types.h"
U_NAMESPACE_BEGIN namespace numparse {
namespace impl {
/**
* Base class for AnyMatcher and SeriesMatcher.
*
* A composition exposes its child matchers as a range of matcher pointers delimited by begin() and
* end(); the match() and postProcess() implementations of the subclasses iterate over that range.
*/
class CompositionMatcher : public NumberParseMatcher {
protected:
// No construction except by subclasses!
CompositionMatcher() = default;
// To be overridden by subclasses (used for iteration): start of the range of child matcher pointers.
virtual const NumberParseMatcher* const* begin() const = 0;
// To be overridden by subclasses (used for iteration): end of the range of child matcher pointers.
virtual const NumberParseMatcher* const* end() const = 0;
};
/**
* Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses
* the first matcher in the list to succeed.
*
* A child matcher counts as having succeeded when it advances the offset of the segment; iteration
* over the child matchers stops at that point.
*
* NOTE: In C++, this is a base class, unlike ICU4J, which uses a factory-style interface.
*
* @author sffc
* @see SeriesMatcher
*/
class AnyMatcher : public CompositionMatcher {
public:
// Iterates over the child matchers and stops as soon as the segment offset has advanced.
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
// Calls postProcess() on every child matcher.
void postProcess(ParsedNumber& result) const override;
protected:
// No construction except by subclasses!
AnyMatcher() = default;
};
/**
* Composes a number of matchers, running one after another. Matches the input string only if all of the
* matchers in the series succeed. Performs greedy matches within the context of the series.
*
* Matchers that report isFlexible() are re-run while they keep consuming input and may match nothing.
* If a non-flexible matcher fails to consume input, the segment offset and the ParsedNumber are
* restored to the values they had when match() was entered.
*
* @author sffc
* @see AnyMatcher
*/
class SeriesMatcher : public CompositionMatcher {
public:
// Runs the series against the segment; see the class comment for the success and rollback rules.
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
// Calls postProcess() on every matcher in the series.
void postProcess(ParsedNumber& result) const override;
protected:
// No construction except by subclasses!
SeriesMatcher() = default;
};
/**
* An implementation of SeriesMatcher that references an array of matchers.
*
* The object adopts the array, but NOT the matchers contained inside the array.
* The matchers themselves must therefore be kept alive by the caller for as long as this object is used.
*/
class ArraySeriesMatcher : public SeriesMatcher {
public:
/** The array is adopted, but NOT the matchers inside the array. */
ArraySeriesMatcher(NumberParseMatcher** matchers, int32_t matchersLen);
// Returns the lead code points of the first matcher in the array.
const UnicodeSet& getLeadCodePoints() override;
protected:
// Iteration range over the adopted array: [begin(), begin() + fMatchersLen).
const NumberParseMatcher* const* begin() const override;
const NumberParseMatcher* const* end() const override;
private:
// The adopted array of matcher pointers.
LocalArray<NumberParseMatcher*> fMatchers;
// Number of entries in fMatchers.
int32_t fMatchersLen;
};
} // namespace impl
} // namespace numparse
U_NAMESPACE_END
#endif //__SOURCE_NUMPARSE_COMPOSITIONS__
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -9,12 +9,23 @@
#include "numparse_currency.h"
#include "ucurrimp.h"
#include "unicode/errorcode.h"
#include "numparse_utils.h"
using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;
namespace {
/**
 * Copies the first three code units of src into dest and writes a terminating zero after them.
 * dest must therefore have room for at least four UChars.
 */
inline void copyCurrencyCode(UChar* dest, const UChar* src) {
    const int32_t kCodeLength = 3;
    uprv_memcpy(dest, src, kCodeLength * sizeof(UChar));
    dest[kCodeLength] = 0;
}
}
// Keeps only the name of the locale (locale.getName()) in fLocaleName; any failure while building
// that string is reported through status.
// NOTE(review): the -1 length argument presumably means "NUL-terminated" - confirm against CharString.
CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status)
: fLocaleName(locale.getName(), -1, status) {}
@ -52,15 +63,84 @@ bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, U
return partialMatch;
}
const UnicodeSet* CurrencyNamesMatcher::getLeadCodePoints() const {
ErrorCode status;
UnicodeSet* leadCodePoints = new UnicodeSet();
uprv_currencyLeads(fLocaleName.data(), *leadCodePoints, status);
// Always apply case mapping closure for currencies
leadCodePoints->closeOver(USET_ADD_CASE_MAPPINGS);
leadCodePoints->freeze();
/**
 * Returns the set of code points that can start a currency name for this matcher's locale.
 * The set is built on the first call and kept in fLocalLeadCodePoints for later calls.
 */
const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() {
    if (!fLocalLeadCodePoints.isNull()) {
        // Already computed by an earlier call.
        return *fLocalLeadCodePoints;
    }

    ErrorCode status;
    auto* leads = new UnicodeSet();
    uprv_currencyLeads(fLocaleName.data(), *leads, status);
    // Always apply case mapping closure for currencies
    leads->closeOver(USET_ADD_CASE_MAPPINGS);
    leads->freeze();
    fLocalLeadCodePoints.adoptInstead(leads);
    return *fLocalLeadCodePoints;
}
return leadCodePoints;
// Stores copies of the two strings to match (currency1, currency2) and copies the currency code
// into the fCurrencyCode buffer via copyCurrencyCode(), which reads three code units from currencyCode.
CurrencyCustomMatcher::CurrencyCustomMatcher(const char16_t* currencyCode, const UnicodeString& currency1,
const UnicodeString& currency2)
: fCurrency1(currency1), fCurrency2(currency2) {
copyCurrencyCode(fCurrencyCode, currencyCode);
}
/**
 * Matches one of the two custom currency strings at the start of the segment.
 *
 * If result already carries a currency code, nothing is matched. Otherwise fCurrency1 is tried
 * first, then fCurrency2. On the first full match the currency code is written to result, the
 * segment is advanced past the matched string, and the method returns immediately.
 *
 * @param segment The segment to match against; advanced past the matched string on success.
 * @param result  Receives the currency code and the consumed-chars position on success.
 * @return On a match: true if the match reached the end of the segment. Without a match: true if
 *         the whole remaining segment is a prefix of one of the two strings.
 */
bool CurrencyCustomMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
    if (result.currencyCode[0] != 0) {
        return false;
    }

    // An empty string would "match" without consuming anything, so require a non-empty overlap.
    const int32_t overlap1 = segment.getCommonPrefixLength(fCurrency1);
    if (overlap1 > 0 && overlap1 == fCurrency1.length()) {
        copyCurrencyCode(result.currencyCode, fCurrencyCode);
        segment.adjustOffset(overlap1);
        result.setCharsConsumed(segment);
        // Stop here: falling through would try fCurrency2 against the text after this match.
        return segment.length() == 0;
    }

    const int32_t overlap2 = segment.getCommonPrefixLength(fCurrency2);
    if (overlap2 > 0 && overlap2 == fCurrency2.length()) {
        copyCurrencyCode(result.currencyCode, fCurrencyCode);
        segment.adjustOffset(overlap2);
        result.setCharsConsumed(segment);
        return segment.length() == 0;
    }

    // No full match. The segment has not been moved, so these lengths are directly comparable.
    return overlap1 == segment.length() || overlap2 == segment.length();
}
/**
 * Returns the set holding the lead code points of the two custom currency strings.
 * The set is built on the first call and kept in fLocalLeadCodePoints for later calls.
 */
const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() {
    if (!fLocalLeadCodePoints.isNull()) {
        return *fLocalLeadCodePoints;
    }

    auto* leads = new UnicodeSet();
    utils::putLeadCodePoint(fCurrency1, leads);
    utils::putLeadCodePoint(fCurrency2, leads);
    leads->freeze();
    fLocalLeadCodePoints.adoptInstead(leads);
    return *fLocalLeadCodePoints;
}
// Moves the two by-value arguments into the fNamesMatcher and fCustomMatcher members, then fills
// fMatcherArray with the addresses of those two members. The array is what begin()/end() expose,
// so it refers to sub-objects of this very instance.
CurrencyAnyMatcher::CurrencyAnyMatcher(CurrencyNamesMatcher namesMatcher,
CurrencyCustomMatcher customMatcher)
: fNamesMatcher(std::move(namesMatcher)), fCustomMatcher(std::move(customMatcher)) {
fMatcherArray[0] = &fNamesMatcher;
fMatcherArray[1] = &fCustomMatcher;
}
/**
 * Returns the union of the lead code points of the names matcher and of the custom matcher.
 * The union is built on the first call and kept in fLocalLeadCodePoints for later calls.
 */
const UnicodeSet& CurrencyAnyMatcher::getLeadCodePoints() {
    if (!fLocalLeadCodePoints.isNull()) {
        return *fLocalLeadCodePoints;
    }

    auto* combined = new UnicodeSet();
    combined->addAll(fNamesMatcher.getLeadCodePoints());
    combined->addAll(fCustomMatcher.getLeadCodePoints());
    combined->freeze();
    fLocalLeadCodePoints.adoptInstead(combined);
    return *fLocalLeadCodePoints;
}
// Start of the iteration range: the first entry of the two-element fMatcherArray.
const NumberParseMatcher* const* CurrencyAnyMatcher::begin() const {
return fMatcherArray;
}
// End of the iteration range: one past the second (last) entry of fMatcherArray.
const NumberParseMatcher* const* CurrencyAnyMatcher::end() const {
return fMatcherArray + 2;
}

View file

@ -8,6 +8,7 @@
#define __NUMPARSE_CURRENCY_H__
#include <utility>
#include "numparse_types.h"
#include "numparse_compositions.h"
#include "charstr.h"
U_NAMESPACE_BEGIN namespace numparse {
@ -29,7 +30,7 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
const UnicodeSet* getLeadCodePoints() const override;
const UnicodeSet& getLeadCodePoints() override;
private:
// We could use Locale instead of CharString here, but
@ -39,6 +40,45 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
};
// Matcher for a single currency that is given explicitly as a currency code plus two strings
// (currency1 and currency2) that are matched literally against the start of the input.
class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory {
public:
// currencyCode: at least three code units are read from it; currency1/currency2: the strings to match.
CurrencyCustomMatcher(const char16_t* currencyCode, const UnicodeString& currency1,
const UnicodeString& currency2);
// Does nothing if result already has a currency code; otherwise tries to match the two strings.
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
// Lead code points of the two strings.
const UnicodeSet& getLeadCodePoints() override;
private:
// Three code units of currency code followed by a terminating zero.
UChar fCurrencyCode[4];
UnicodeString fCurrency1;
UnicodeString fCurrency2;
};
/**
* An implementation of AnyMatcher, allowing for either currency data or locale currency matches.
*/
class CurrencyAnyMatcher : public AnyMatcher, public UMemory {
public:
/** Calls std::move on the two arguments. */
CurrencyAnyMatcher(CurrencyNamesMatcher namesMatcher, CurrencyCustomMatcher customMatcher);
const UnicodeSet& getLeadCodePoints() override;
protected:
const NumberParseMatcher* const* begin() const override;
const NumberParseMatcher* const* end() const override;
private:
CurrencyNamesMatcher fNamesMatcher;
CurrencyCustomMatcher fCustomMatcher;
const NumberParseMatcher* fMatcherArray[2];
};
} // namespace impl
} // namespace numparse
U_NAMESPACE_END

View file

@ -291,22 +291,25 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
return segment.length() == 0 || hasPartialPrefix;
}
const UnicodeSet* DecimalMatcher::getLeadCodePoints() const {
const UnicodeSet& DecimalMatcher::getLeadCodePoints() {
if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
return new UnicodeSet(*leadSet);
return *leadSet;
}
auto* leadCodePoints = new UnicodeSet();
// Assumption: the sets are all single code points.
leadCodePoints->addAll(*unisets::get(unisets::DIGITS));
leadCodePoints->addAll(*separatorSet);
if (!fLocalDigitStrings.isNull()) {
for (int i = 0; i < 10; i++) {
utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints);
if (fLocalLeadCodePoints.isNull()) {
auto* leadCodePoints = new UnicodeSet();
// Assumption: the sets are all single code points.
leadCodePoints->addAll(*unisets::get(unisets::DIGITS));
leadCodePoints->addAll(*separatorSet);
if (!fLocalDigitStrings.isNull()) {
for (int i = 0; i < 10; i++) {
utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints);
}
}
leadCodePoints->freeze();
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
}
leadCodePoints->freeze();
return leadCodePoints;
return *fLocalLeadCodePoints;
}

View file

@ -27,7 +27,7 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
bool
match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign, UErrorCode& status) const;
const UnicodeSet* getLeadCodePoints() const override;
const UnicodeSet& getLeadCodePoints() override;
private:
/** If true, only accept strings whose grouping sizes match the locale */
@ -56,7 +56,7 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet* leadSet;
// Make this class the owner of a few objects that could be allocated.
// The first two LocalPointers are used for assigning ownership only.
// The first three LocalPointers are used for assigning ownership only.
LocalPointer<const UnicodeSet> fLocalDecimalUniSet;
LocalPointer<const UnicodeSet> fLocalSeparatorSet;
LocalArray<const UnicodeString> fLocalDigitStrings;

View file

@ -32,7 +32,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
auto* parser = new NumberParserImpl(parseFlags, true);
DecimalFormatSymbols symbols(locale, status);
parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES};
parser->fLocalMatchers.ignorables = std::move(IgnorablesMatcher(unisets::DEFAULT_IGNORABLES));
// MatcherFactory factory = new MatcherFactory();
// factory.currency = Currency.getInstance("USD");
@ -78,7 +78,7 @@ NumberParserImpl::~NumberParserImpl() {
fNumMatchers = 0;
}
void NumberParserImpl::addMatcher(const NumberParseMatcher& matcher) {
void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) {
if (fNumMatchers + 1 > fMatchers.getCapacity()) {
fMatchers.resize(fNumMatchers * 2, fNumMatchers);
if (fComputeLeads) {
@ -97,17 +97,17 @@ void NumberParserImpl::addMatcher(const NumberParseMatcher& matcher) {
fNumMatchers++;
}
void NumberParserImpl::addLeadCodePointsForMatcher(const NumberParseMatcher& matcher) {
const UnicodeSet* leadCodePoints = matcher.getLeadCodePoints();
void NumberParserImpl::addLeadCodePointsForMatcher(NumberParseMatcher& matcher) {
const UnicodeSet& leadCodePoints = matcher.getLeadCodePoints();
// TODO: Avoid the clone operation here.
if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) {
UnicodeSet* copy = static_cast<UnicodeSet*>(leadCodePoints->cloneAsThawed());
delete leadCodePoints;
auto* copy = dynamic_cast<UnicodeSet*>(leadCodePoints.cloneAsThawed());
copy->closeOver(USET_ADD_CASE_MAPPINGS);
copy->freeze();
fLeads[fNumMatchers] = copy;
} else {
fLeads[fNumMatchers] = leadCodePoints;
// FIXME: new here because we still take ownership
fLeads[fNumMatchers] = new UnicodeSet(leadCodePoints);
}
}

View file

@ -24,7 +24,7 @@ class NumberParserImpl {
static NumberParserImpl* createSimpleParser(const Locale& locale, const UnicodeString& patternString,
parse_flags_t parseFlags, UErrorCode& status);
void addMatcher(const NumberParseMatcher& matcher);
void addMatcher(NumberParseMatcher& matcher);
void freeze();
@ -62,7 +62,7 @@ class NumberParserImpl {
NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
void addLeadCodePointsForMatcher(const NumberParseMatcher& matcher);
void addLeadCodePointsForMatcher(NumberParseMatcher& matcher);
void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;

View file

@ -67,17 +67,20 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
return false;
}
const UnicodeSet* ScientificMatcher::getLeadCodePoints() const {
const UnicodeSet& ScientificMatcher::getLeadCodePoints() {
UChar32 leadCp = fExponentSeparatorString.char32At(0);
const UnicodeSet* s = unisets::get(unisets::SCIENTIFIC_LEAD);
if (s->contains(leadCp)) {
return new UnicodeSet(*s);
} else {
UnicodeSet* leadCodePoints = new UnicodeSet();
return *s;
}
if (fLocalLeadCodePoints.isNull()) {
auto* leadCodePoints = new UnicodeSet();
leadCodePoints->add(leadCp);
leadCodePoints->freeze();
return leadCodePoints;
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
}
return *fLocalLeadCodePoints;
}

View file

@ -25,7 +25,7 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory {
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
const UnicodeSet* getLeadCodePoints() const override;
const UnicodeSet& getLeadCodePoints() override;
private:
UnicodeString fExponentSeparatorString;

View file

@ -54,17 +54,20 @@ bool SymbolMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCo
return overlap == segment.length();
}
const UnicodeSet* SymbolMatcher::getLeadCodePoints() const {
const UnicodeSet& SymbolMatcher::getLeadCodePoints() {
if (fString.isEmpty()) {
// Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
return new UnicodeSet(*fUniSet);
return *fUniSet;
}
UnicodeSet* leadCodePoints = new UnicodeSet();
utils::putLeadCodePoints(fUniSet, leadCodePoints);
utils::putLeadCodePoint(fString, leadCodePoints);
leadCodePoints->freeze();
return leadCodePoints;
if (fLocalLeadCodePoints.isNull()) {
auto* leadCodePoints = new UnicodeSet();
utils::putLeadCodePoints(fUniSet, leadCodePoints);
utils::putLeadCodePoint(fString, leadCodePoints);
leadCodePoints->freeze();
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
}
return *fLocalLeadCodePoints;
}
@ -86,7 +89,7 @@ void IgnorablesMatcher::accept(StringSegment&, ParsedNumber&) const {
InfinityMatcher::InfinityMatcher(const DecimalFormatSymbols& dfs)
: SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::INFINITY) {
: SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol), unisets::INFINITY) {
}
bool InfinityMatcher::isDisabled(const ParsedNumber& result) const {
@ -118,15 +121,15 @@ NanMatcher::NanMatcher(const DecimalFormatSymbols& dfs)
: SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::EMPTY) {
}
const UnicodeSet* NanMatcher::getLeadCodePoints() const {
const UnicodeSet& NanMatcher::getLeadCodePoints() {
// Overriding this here to allow use of statically allocated sets
int leadCp = fString.char32At(0);
const UnicodeSet* s = unisets::get(unisets::NAN_LEAD);
if (s->contains(leadCp)) {
return new UnicodeSet(*s);
} else {
return SymbolMatcher::getLeadCodePoints();
return *s;
}
return SymbolMatcher::getLeadCodePoints();
}
bool NanMatcher::isDisabled(const ParsedNumber& result) const {
@ -146,11 +149,11 @@ bool PaddingMatcher::isFlexible() const {
return true;
}
bool PaddingMatcher::isDisabled(const ParsedNumber& result) const {
bool PaddingMatcher::isDisabled(const ParsedNumber&) const {
return false;
}
void PaddingMatcher::accept(StringSegment& segment, ParsedNumber& result) const {
void PaddingMatcher::accept(StringSegment&, ParsedNumber&) const {
// No-op
}

View file

@ -28,7 +28,8 @@ class SymbolMatcher : public NumberParseMatcher, public UMemory {
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
const UnicodeSet* getLeadCodePoints() const override;
/** NOTE: This method is not guaranteed to be thread-safe. */
const UnicodeSet& getLeadCodePoints() override;
virtual bool isDisabled(const ParsedNumber& result) const = 0;
@ -92,7 +93,7 @@ class NanMatcher : public SymbolMatcher {
NanMatcher(const DecimalFormatSymbols& dfs);
const UnicodeSet* getLeadCodePoints() const override;
const UnicodeSet& getLeadCodePoints() override;
protected:
bool isDisabled(const ParsedNumber& result) const override;

View file

@ -244,8 +244,6 @@ class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
*/
class NumberParseMatcher {
public:
virtual ~NumberParseMatcher() = default;
/**
* Matchers can override this method to return true to indicate that they are optional and can be run
* repeatedly. Used by SeriesMatcher, primarily in the context of IgnorablesMatcher.
@ -259,6 +257,8 @@ class NumberParseMatcher {
* something interesting in the StringSegment, it should update the offset of the StringSegment
* corresponding to how many chars were matched.
*
* This method is thread-safe.
*
* @param segment
* The StringSegment to match against. Matches always start at the beginning of the
* segment. The segment is guaranteed to contain at least one char.
@ -275,9 +275,12 @@ class NumberParseMatcher {
* return value is used to skip this matcher unless a segment begins with a char in this set. To make
* this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}.
*
* The returned UnicodeSet needs adoption!
* The returned UnicodeSet does not need adoption and is guaranteed to be alive for as long as the
* object that returned it.
*
* This method is NOT thread-safe.
*/
virtual const UnicodeSet* getLeadCodePoints() const = 0;
virtual const UnicodeSet& getLeadCodePoints() = 0;
/**
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
@ -290,6 +293,13 @@ class NumberParseMatcher {
virtual void postProcess(ParsedNumber&) const {
// Default implementation: no-op
};
protected:
// No construction except by subclasses!
NumberParseMatcher() = default;
// Optional ownership of the leadCodePoints set
LocalPointer<const UnicodeSet> fLocalLeadCodePoints;
};

View file

@ -238,6 +238,12 @@ UnicodeString toString(UBool b) {
return b ? UnicodeString("TRUE"):UnicodeString("FALSE");
}
/**
 * Returns the pattern string of a UnicodeSet, for use in test messages.
 *
 * @param uniset The set to render.
 * @param status Unused. It is kept so that existing callers keep compiling; producing the pattern
 *               does not report errors.
 * @return the pattern of the set, with unprintable characters left unescaped.
 */
UnicodeString toString(const UnicodeSet& uniset, UErrorCode& /*status*/) {
    UnicodeString result;
    // The second parameter of toPattern() is the escapeUnprintable flag, not an error code.
    // Passing the status there made the flag depend on the incoming error value.
    uniset.toPattern(result, FALSE);
    return result;
}
// stephen - cleaned up 05/05/99
UnicodeString operator+(const UnicodeString& left, char num)
{ return left + (long)num; }
@ -2050,6 +2056,24 @@ UBool IntlTest::assertEquals(const char* message,
return TRUE;
}
/**
 * Asserts that two UnicodeSets are equal.
 *
 * Logs a failure showing the patterns of both sets when they differ. When VERBOSE_ASSERTIONS is
 * defined, a successful comparison is logged as well.
 *
 * @return TRUE if the sets are equal, FALSE otherwise.
 */
UBool IntlTest::assertEquals(const char* message,
                             const UnicodeSet& expected,
                             const UnicodeSet& actual) {
    IcuTestErrorCode status(*this, "assertEqualsUniSet");

    if (expected != actual) {
        const UnicodeString actualPattern = toString(actual, status);
        const UnicodeString expectedPattern = toString(expected, status);
        errln((UnicodeString)"FAIL: " + message + "; got " +
              actualPattern +
              "; expected " + expectedPattern);
        return FALSE;
    }

#ifdef VERBOSE_ASSERTIONS
    logln((UnicodeString)"Ok: " + message + "; got " + toString(actual, status));
#endif
    return TRUE;
}
#if !UCONFIG_NO_FORMATTING
UBool IntlTest::assertEquals(const char* message,
@ -2136,6 +2160,11 @@ UBool IntlTest::assertEquals(const UnicodeString& message,
UErrorCode actual) {
return assertEquals(extractToAssertBuf(message), expected, actual);
}
// UnicodeSet comparison taking the message as a UnicodeString: converts the message with
// extractToAssertBuf() and forwards to the overload that takes a const char* message.
UBool IntlTest::assertEquals(const UnicodeString& message,
const UnicodeSet& expected,
const UnicodeSet& actual) {
return assertEquals(extractToAssertBuf(message), expected, actual);
}
#if !UCONFIG_NO_FORMATTING
UBool IntlTest::assertEquals(const UnicodeString& message,

View file

@ -16,6 +16,7 @@
// The following includes utypes.h, uobject.h and unistr.h
#include "unicode/fmtable.h"
#include "unicode/testlog.h"
#include "unicode/uniset.h"
U_NAMESPACE_USE
@ -295,6 +296,7 @@ public:
UBool assertEquals(const char* message, int64_t expected, int64_t actual);
UBool assertEquals(const char* message, double expected, double actual);
UBool assertEquals(const char* message, UErrorCode expected, UErrorCode actual);
UBool assertEquals(const char* message, const UnicodeSet& expected, const UnicodeSet& actual);
#if !UCONFIG_NO_FORMATTING
UBool assertEquals(const char* message, const Formattable& expected,
const Formattable& actual, UBool possibleDataError=FALSE);
@ -312,6 +314,7 @@ public:
UBool assertEquals(const UnicodeString& message, int64_t expected, int64_t actual);
UBool assertEquals(const UnicodeString& message, double expected, double actual);
UBool assertEquals(const UnicodeString& message, UErrorCode expected, UErrorCode actual);
UBool assertEquals(const UnicodeString& message, const UnicodeSet& expected, const UnicodeSet& actual);
virtual void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); // overide !

View file

@ -212,6 +212,7 @@ class NumberParserTest : public IntlTest {
void testBasic();
void testLocaleFi();
void testSeriesMatcher();
void testCurrencyAnyMatcher();
void testGroupingDisabled();
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par = 0);

View file

@ -21,6 +21,7 @@ void NumberParserTest::runIndexedTest(int32_t index, UBool exec, const char*& na
}
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(testBasic);
TESTCASE_AUTO(testSeriesMatcher);
TESTCASE_AUTO_END;
}
@ -99,7 +100,7 @@ void NumberParserTest::testBasic() {
{3, u"0", u"0", 1, 0.0}};
parse_flags_t parseFlags = PARSE_FLAG_IGNORE_CASE | PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
for (auto cas : cases) {
for (auto& cas : cases) {
UnicodeString inputString(cas.inputString);
UnicodeString patternString(cas.patternString);
LocalPointer<const NumberParserImpl> parser(
@ -153,5 +154,54 @@ void NumberParserTest::testBasic() {
}
}
/**
 * Exercises ArraySeriesMatcher with the series: plus sign, minus sign, ignorables, percent sign,
 * ignorables. Checks the lead code point set of the series and, for a table of inputs, the segment
 * offset after match() and the returned "maybe more" flag.
 */
void NumberParserTest::testSeriesMatcher() {
    IcuTestErrorCode status(*this, "testSeriesMatcher");

    DecimalFormatSymbols symbols("en", status);

    PlusSignMatcher m0(symbols, false);
    MinusSignMatcher m1(symbols, false);
    IgnorablesMatcher m2(unisets::DEFAULT_IGNORABLES);
    PercentMatcher m3(symbols);
    IgnorablesMatcher m4(unisets::DEFAULT_IGNORABLES);

    // The series adopts the array, but the matchers themselves stay owned by this function.
    ArraySeriesMatcher series(new NumberParseMatcher* [5]{&m0, &m1, &m2, &m3, &m4}, 5);

    assertEquals(
            "Lead set should be equal to lead set of lead matcher",
            *unisets::get(unisets::PLUS_SIGN),
            series.getLeadCodePoints());

    static const struct TestCase {
        const char16_t* input;
        int32_t expectedOffset;
        bool expectedMaybeMore;
    } cases[] = {{u"", 0, true},
                 {u" ", 0, false},
                 {u"$", 0, false},
                 {u"+", 0, true},
                 {u" +", 0, false},
                 {u"+-", 0, true},
                 {u"+ -", 0, false},
                 {u"+- ", 0, true},
                 {u"+- $", 0, false},
                 {u"+-%", 3, true},
                 {u" +- % ", 0, false},
                 // Seven code units (two spaces on each side of '%'): the expected offset of 7 is
                 // only reachable if the input is at least that long.
                 {u"+-  %  ", 7, true},
                 {u"+-%$", 3, false}};

    for (auto& cas : cases) {
        UnicodeString input(cas.input);

        StringSegment segment(input, 0);
        ParsedNumber result;
        bool actualMaybeMore = series.match(segment, result, status);
        int actualOffset = segment.getOffset();

        assertEquals("'" + input + "'", cas.expectedOffset, actualOffset);
        assertEquals("'" + input + "'", cas.expectedMaybeMore, actualMaybeMore);
    }
}
#endif

View file

@ -9,19 +9,19 @@ import com.ibm.icu.util.ULocale;
/**
* A matcher for a single currency instance (not the full trie).
*/
public class CurrencyMatcher implements NumberParseMatcher {
public class CurrencyCustomMatcher implements NumberParseMatcher {
private final String isoCode;
private final String currency1;
private final String currency2;
public static CurrencyMatcher getInstance(Currency currency, ULocale loc) {
return new CurrencyMatcher(currency.getSubtype(),
public static CurrencyCustomMatcher getInstance(Currency currency, ULocale loc) {
return new CurrencyCustomMatcher(currency.getSubtype(),
currency.getSymbol(loc),
currency.getCurrencyCode());
}
private CurrencyMatcher(String isoCode, String currency1, String currency2) {
private CurrencyCustomMatcher(String isoCode, String currency1, String currency2) {
this.isoCode = isoCode;
this.currency1 = currency1;
this.currency2 = currency2;

View file

@ -11,21 +11,24 @@ import com.ibm.icu.util.Currency.CurrencyStringInfo;
import com.ibm.icu.util.ULocale;
/**
* @author sffc
* Matches currencies according to all available strings in locale data.
*
* The implementation of this class is different between J and C. See #13584 for a follow-up.
*
* @author sffc
*/
public class CurrencyTrieMatcher implements NumberParseMatcher {
public class CurrencyNamesMatcher implements NumberParseMatcher {
private final TextTrieMap<CurrencyStringInfo> longNameTrie;
private final TextTrieMap<CurrencyStringInfo> symbolTrie;
public static CurrencyTrieMatcher getInstance(ULocale locale) {
public static CurrencyNamesMatcher getInstance(ULocale locale) {
// TODO: Pre-compute some of the more popular locales?
return new CurrencyTrieMatcher(locale);
return new CurrencyNamesMatcher(locale);
}
private CurrencyTrieMatcher(ULocale locale) {
// TODO: Currency trie does not currently have an option for case folding. It defaults to use
private CurrencyNamesMatcher(ULocale locale) {
// TODO: Currency trie does not currently have an option for case folding. It defaults to use
// case folding on long-names but not symbols.
longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
@ -55,6 +58,8 @@ public class CurrencyTrieMatcher implements NumberParseMatcher {
UnicodeSet leadCodePoints = new UnicodeSet();
longNameTrie.putLeadCodePoints(leadCodePoints);
symbolTrie.putLeadCodePoints(leadCodePoints);
// Always apply case mapping closure for currencies
leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
return leadCodePoints.freeze();
}

View file

@ -7,14 +7,15 @@ import com.ibm.icu.util.Currency;
import com.ibm.icu.util.ULocale;
/**
* @author sffc
* Small helper class that generates matchers for SeriesMatcher.
*
* @author sffc
*/
public class MatcherFactory {
Currency currency;
DecimalFormatSymbols symbols;
IgnorablesMatcher ignorables;
ULocale locale;
public Currency currency;
public DecimalFormatSymbols symbols;
public IgnorablesMatcher ignorables;
public ULocale locale;
public MinusSignMatcher minusSign(boolean allowTrailing) {
return MinusSignMatcher.getInstance(symbols, allowTrailing);
@ -34,8 +35,8 @@ public class MatcherFactory {
public AnyMatcher currency() {
AnyMatcher any = new AnyMatcher();
any.addMatcher(CurrencyMatcher.getInstance(currency, locale));
any.addMatcher(CurrencyTrieMatcher.getInstance(locale));
any.addMatcher(CurrencyCustomMatcher.getInstance(currency, locale));
any.addMatcher(CurrencyNamesMatcher.getInstance(locale));
any.freeze();
return any;
}

View file

@ -95,7 +95,7 @@ public class NumberParserImpl {
parser.addMatcher(InfinityMatcher.getInstance(symbols));
parser.addMatcher(PaddingMatcher.getInstance("@"));
parser.addMatcher(ScientificMatcher.getInstance(symbols, grouper));
parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
parser.addMatcher(CurrencyNamesMatcher.getInstance(locale));
parser.addMatcher(new RequireNumberMatcher());
parser.freeze();
@ -213,8 +213,8 @@ public class NumberParserImpl {
////////////////////////
if (parseCurrency || patternInfo.hasCurrencySign()) {
parser.addMatcher(CurrencyMatcher.getInstance(currency, locale));
parser.addMatcher(CurrencyTrieMatcher.getInstance(locale));
parser.addMatcher(CurrencyCustomMatcher.getInstance(currency, locale));
parser.addMatcher(CurrencyNamesMatcher.getInstance(locale));
}
///////////////////////////////

View file

@ -7,8 +7,11 @@ import static org.junit.Assert.assertTrue;
import org.junit.Test;
import com.ibm.icu.impl.number.CustomSymbolCurrency;
import com.ibm.icu.impl.number.DecimalFormatProperties;
import com.ibm.icu.impl.number.parse.AnyMatcher;
import com.ibm.icu.impl.number.parse.IgnorablesMatcher;
import com.ibm.icu.impl.number.parse.MatcherFactory;
import com.ibm.icu.impl.number.parse.MinusSignMatcher;
import com.ibm.icu.impl.number.parse.NumberParserImpl;
import com.ibm.icu.impl.number.parse.ParsedNumber;
@ -222,6 +225,38 @@ public class NumberParserTest {
}
}
/**
 * Checks the AnyMatcher produced by MatcherFactory.currency() for English with a custom "ICU"
 * currency: each input must yield the expected currency code (or none) and, on a match, must be
 * consumed completely.
 */
@Test
public void testCurrencyAnyMatcher() {
    MatcherFactory factory = new MatcherFactory();
    factory.locale = ULocale.ENGLISH;
    factory.currency = new CustomSymbolCurrency("ICU", "IU$", "ICU");
    AnyMatcher matcher = factory.currency();

    // Each row: { input, expected currency code }; a null code means nothing should match.
    String[][] cases = {
            { "", null },
            { "FOO", null },
            { "USD", "USD" },
            { "$", "USD" },
            { "US dollars", "USD" },
            { "eu", null },
            { "euros", "EUR" },
            { "ICU", "ICU" },
            { "IU$", "ICU" } };

    for (String[] row : cases) {
        String input = row[0];
        String expectedCurrencyCode = row[1];
        int expectedCharEnd = (expectedCurrencyCode == null) ? 0 : input.length();

        StringSegment segment = new StringSegment(input, 0);
        ParsedNumber result = new ParsedNumber();
        matcher.match(segment, result);

        assertEquals("Parsing " + input, expectedCurrencyCode, result.currencyCode);
        assertEquals("Whole string on " + input, expectedCharEnd, result.charEnd);
    }
}
@Test
public void testGroupingDisabled() {
DecimalFormatProperties properties = new DecimalFormatProperties();