mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 01:11:02 +00:00
ICU-13634 Changes NumberParseMatcher getLeadCodePoints() to smokeTest() in C++ and Java. The new method is more versatile and eliminates the requirement to maintain two code paths for "lead chars" and "no lead chars".
X-SVN-Rev: 41131
This commit is contained in:
parent
8b4c367468
commit
01916cad11
40 changed files with 266 additions and 408 deletions
|
@ -206,21 +206,15 @@ CodePointMatcher::CodePointMatcher(UChar32 cp)
|
|||
: fCp(cp) {}
|
||||
|
||||
bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
|
||||
if (segment.matches(fCp)) {
|
||||
if (segment.startsWith(fCp)) {
|
||||
segment.adjustOffsetByCodePoint();
|
||||
result.setCharsConsumed(segment);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const UnicodeSet& CodePointMatcher::getLeadCodePoints() {
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
leadCodePoints->add(fCp);
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
|
||||
return segment.startsWith(fCp);
|
||||
}
|
||||
|
||||
UnicodeString CodePointMatcher::toString() const {
|
||||
|
@ -427,19 +421,9 @@ bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCod
|
|||
}
|
||||
}
|
||||
|
||||
const UnicodeSet& AffixMatcher::getLeadCodePoints() {
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
if (fPrefix != nullptr) {
|
||||
leadCodePoints->addAll(fPrefix->getLeadCodePoints());
|
||||
}
|
||||
if (fSuffix != nullptr) {
|
||||
leadCodePoints->addAll(fSuffix->getLeadCodePoints());
|
||||
}
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
bool AffixMatcher::smokeTest(const StringSegment& segment) const {
|
||||
return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
|
||||
(fSuffix != nullptr && fSuffix->smokeTest(segment));
|
||||
}
|
||||
|
||||
void AffixMatcher::postProcess(ParsedNumber& result) const {
|
||||
|
|
|
@ -35,7 +35,7 @@ class CodePointMatcher : public NumberParseMatcher, public UMemory {
|
|||
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
|
@ -190,7 +190,7 @@ class AffixMatcher : public NumberParseMatcher, public UMemory {
|
|||
|
||||
void postProcess(ParsedNumber& result) const override;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
int8_t compareTo(const AffixMatcher& rhs) const;
|
||||
|
||||
|
|
|
@ -38,9 +38,19 @@ bool AnyMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&
|
|||
return maybeMore;
|
||||
}
|
||||
|
||||
bool AnyMatcher::smokeTest(const StringSegment& segment) const {
|
||||
// NOTE: The range-based for loop calls the virtual begin() and end() methods.
|
||||
for (auto& matcher : *this) {
|
||||
if (matcher->smokeTest(segment)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void AnyMatcher::postProcess(ParsedNumber& result) const {
|
||||
// NOTE: The range-based for loop calls the virtual begin() and end() methods.
|
||||
for (auto* matcher : *this) {
|
||||
for (auto& matcher : *this) {
|
||||
matcher->postProcess(result);
|
||||
}
|
||||
}
|
||||
|
@ -83,6 +93,17 @@ bool SeriesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCo
|
|||
return maybeMore;
|
||||
}
|
||||
|
||||
bool SeriesMatcher::smokeTest(const StringSegment& segment) const {
|
||||
// NOTE: The range-based for loop calls the virtual begin() and end() methods.
|
||||
// NOTE: We only want the first element. Use the for loop for boundary checking.
|
||||
for (auto& matcher : *this) {
|
||||
// SeriesMatchers are never allowed to start with a Flexible matcher.
|
||||
U_ASSERT(!matcher->isFlexible());
|
||||
return matcher->smokeTest(segment);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void SeriesMatcher::postProcess(ParsedNumber& result) const {
|
||||
// NOTE: The range-based for loop calls the virtual begin() and end() methods.
|
||||
for (auto* matcher : *this) {
|
||||
|
@ -99,12 +120,6 @@ ArraySeriesMatcher::ArraySeriesMatcher(MatcherArray& matchers, int32_t matchersL
|
|||
: fMatchers(std::move(matchers)), fMatchersLen(matchersLen) {
|
||||
}
|
||||
|
||||
const UnicodeSet& ArraySeriesMatcher::getLeadCodePoints() {
|
||||
// SeriesMatchers are never allowed to start with a Flexible matcher.
|
||||
U_ASSERT(!fMatchers[0]->isFlexible());
|
||||
return fMatchers[0]->getLeadCodePoints();
|
||||
}
|
||||
|
||||
int32_t ArraySeriesMatcher::length() const {
|
||||
return fMatchersLen;
|
||||
}
|
||||
|
|
|
@ -42,6 +42,8 @@ class AnyMatcher : public CompositionMatcher {
|
|||
public:
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
void postProcess(ParsedNumber& result) const override;
|
||||
|
||||
protected:
|
||||
|
@ -61,6 +63,8 @@ class SeriesMatcher : public CompositionMatcher {
|
|||
public:
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
void postProcess(ParsedNumber& result) const override;
|
||||
|
||||
virtual int32_t length() const = 0;
|
||||
|
@ -80,13 +84,11 @@ class ArraySeriesMatcher : public SeriesMatcher {
|
|||
public:
|
||||
ArraySeriesMatcher(); // WARNING: Leaves the object in an unusable state
|
||||
|
||||
typedef MaybeStackArray<NumberParseMatcher*, 3> MatcherArray;
|
||||
typedef MaybeStackArray<const NumberParseMatcher*, 3> MatcherArray;
|
||||
|
||||
/** The array is std::move'd */
|
||||
ArraySeriesMatcher(MatcherArray& matchers, int32_t matchersLen);
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
int32_t length() const override;
|
||||
|
|
|
@ -21,7 +21,12 @@ using namespace icu::numparse::impl;
|
|||
|
||||
|
||||
CurrencyNamesMatcher::CurrencyNamesMatcher(const Locale& locale, UErrorCode& status)
|
||||
: fLocaleName(locale.getName(), -1, status) {}
|
||||
: fLocaleName(locale.getName(), -1, status) {
|
||||
uprv_currencyLeads(fLocaleName.data(), fLeadCodePoints, status);
|
||||
// Always apply case mapping closure for currencies
|
||||
fLeadCodePoints.closeOver(USET_ADD_CASE_MAPPINGS);
|
||||
fLeadCodePoints.freeze();
|
||||
}
|
||||
|
||||
bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
|
||||
if (result.currencyCode[0] != 0) {
|
||||
|
@ -57,17 +62,8 @@ bool CurrencyNamesMatcher::match(StringSegment& segment, ParsedNumber& result, U
|
|||
return partialMatch;
|
||||
}
|
||||
|
||||
const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() {
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
ErrorCode status;
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
uprv_currencyLeads(fLocaleName.data(), *leadCodePoints, status);
|
||||
// Always apply case mapping closure for currencies
|
||||
leadCodePoints->closeOver(USET_ADD_CASE_MAPPINGS);
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
bool CurrencyNamesMatcher::smokeTest(const StringSegment& segment) const {
|
||||
return segment.startsWith(fLeadCodePoints);
|
||||
}
|
||||
|
||||
UnicodeString CurrencyNamesMatcher::toString() const {
|
||||
|
@ -103,15 +99,8 @@ bool CurrencyCustomMatcher::match(StringSegment& segment, ParsedNumber& result,
|
|||
return overlap1 == segment.length() || overlap2 == segment.length();
|
||||
}
|
||||
|
||||
const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() {
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
utils::putLeadCodePoint(fCurrency1, leadCodePoints);
|
||||
utils::putLeadCodePoint(fCurrency2, leadCodePoints);
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
bool CurrencyCustomMatcher::smokeTest(const StringSegment& segment) const {
|
||||
return segment.startsWith(fCurrency1) || segment.startsWith(fCurrency2);
|
||||
}
|
||||
|
||||
UnicodeString CurrencyCustomMatcher::toString() const {
|
||||
|
@ -144,17 +133,6 @@ CurrencyAnyMatcher& CurrencyAnyMatcher::operator=(CurrencyAnyMatcher&& src) U_NO
|
|||
return *this;
|
||||
}
|
||||
|
||||
const UnicodeSet& CurrencyAnyMatcher::getLeadCodePoints() {
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
leadCodePoints->addAll(fNamesMatcher.getLeadCodePoints());
|
||||
leadCodePoints->addAll(fCustomMatcher.getLeadCodePoints());
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
}
|
||||
|
||||
const NumberParseMatcher* const* CurrencyAnyMatcher::begin() const {
|
||||
return fMatcherArray;
|
||||
}
|
||||
|
|
|
@ -11,6 +11,7 @@
|
|||
#include "numparse_compositions.h"
|
||||
#include "charstr.h"
|
||||
#include "number_currencysymbols.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
U_NAMESPACE_BEGIN namespace numparse {
|
||||
namespace impl {
|
||||
|
@ -32,7 +33,7 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
|
|||
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
|
@ -41,6 +42,7 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
|
|||
// Locale has a non-trivial default constructor.
|
||||
CharString fLocaleName;
|
||||
|
||||
UnicodeSet fLeadCodePoints;
|
||||
};
|
||||
|
||||
|
||||
|
@ -52,7 +54,7 @@ class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory {
|
|||
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
|
@ -78,8 +80,6 @@ class CurrencyAnyMatcher : public AnyMatcher, public UMemory {
|
|||
|
||||
CurrencyAnyMatcher& operator=(CurrencyAnyMatcher&& src) U_NOEXCEPT;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
protected:
|
||||
|
|
|
@ -295,25 +295,23 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
|
|||
return segment.length() == 0 || hasPartialPrefix;
|
||||
}
|
||||
|
||||
const UnicodeSet& DecimalMatcher::getLeadCodePoints() {
|
||||
bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
|
||||
// The common case uses a static leadSet for efficiency.
|
||||
if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
|
||||
return *leadSet;
|
||||
return segment.startsWith(*leadSet);
|
||||
}
|
||||
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
// Assumption: the sets are all single code points.
|
||||
leadCodePoints->addAll(*unisets::get(unisets::DIGITS));
|
||||
leadCodePoints->addAll(*separatorSet);
|
||||
if (!fLocalDigitStrings.isNull()) {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
utils::putLeadCodePoint(fLocalDigitStrings[i], leadCodePoints);
|
||||
}
|
||||
if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
|
||||
return true;
|
||||
}
|
||||
if (fLocalDigitStrings.isNull()) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < 10; i++) {
|
||||
if (segment.startsWith(fLocalDigitStrings[i])) {
|
||||
return true;
|
||||
}
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
return false;
|
||||
}
|
||||
|
||||
UnicodeString DecimalMatcher::toString() const {
|
||||
|
|
|
@ -27,7 +27,7 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
|
|||
bool
|
||||
match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign, UErrorCode& status) const;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
|
|
|
@ -34,7 +34,7 @@ NumberParserImpl*
|
|||
NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString& patternString,
|
||||
parse_flags_t parseFlags, UErrorCode& status) {
|
||||
|
||||
LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags, true));
|
||||
LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
|
||||
DecimalFormatSymbols symbols(locale, status);
|
||||
|
||||
parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES};
|
||||
|
@ -117,7 +117,7 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr
|
|||
}
|
||||
IgnorablesMatcher ignorables(isStrict ? unisets::DEFAULT_IGNORABLES : unisets::STRICT_IGNORABLES);
|
||||
|
||||
LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags, status));
|
||||
LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
|
||||
|
||||
//////////////////////
|
||||
/// AFFIX MATCHERS ///
|
||||
|
@ -197,52 +197,22 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr
|
|||
return parser.orphan();
|
||||
}
|
||||
|
||||
NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags, bool computeLeads)
|
||||
: fParseFlags(parseFlags), fComputeLeads(computeLeads) {
|
||||
NumberParserImpl::NumberParserImpl(parse_flags_t parseFlags)
|
||||
: fParseFlags(parseFlags) {
|
||||
}
|
||||
|
||||
NumberParserImpl::~NumberParserImpl() {
|
||||
if (fComputeLeads) {
|
||||
for (int32_t i = 0; i < fNumMatchers; i++) {
|
||||
delete (fLeads[i]);
|
||||
}
|
||||
}
|
||||
fNumMatchers = 0;
|
||||
}
|
||||
|
||||
void NumberParserImpl::addMatcher(NumberParseMatcher& matcher) {
|
||||
if (fNumMatchers + 1 > fMatchers.getCapacity()) {
|
||||
fMatchers.resize(fNumMatchers * 2, fNumMatchers);
|
||||
if (fComputeLeads) {
|
||||
// The two arrays should grow in tandem:
|
||||
U_ASSERT(fNumMatchers >= fLeads.getCapacity());
|
||||
fLeads.resize(fNumMatchers * 2, fNumMatchers);
|
||||
}
|
||||
}
|
||||
|
||||
fMatchers[fNumMatchers] = &matcher;
|
||||
|
||||
if (fComputeLeads) {
|
||||
addLeadCodePointsForMatcher(matcher);
|
||||
}
|
||||
|
||||
fNumMatchers++;
|
||||
}
|
||||
|
||||
void NumberParserImpl::addLeadCodePointsForMatcher(NumberParseMatcher& matcher) {
|
||||
const UnicodeSet& leadCodePoints = matcher.getLeadCodePoints();
|
||||
// TODO: Avoid the clone operation here.
|
||||
if (0 != (fParseFlags & PARSE_FLAG_IGNORE_CASE)) {
|
||||
auto* copy = dynamic_cast<UnicodeSet*>(leadCodePoints.cloneAsThawed());
|
||||
copy->closeOver(USET_ADD_CASE_MAPPINGS);
|
||||
copy->freeze();
|
||||
fLeads[fNumMatchers] = copy;
|
||||
} else {
|
||||
// FIXME: new here because we still take ownership
|
||||
fLeads[fNumMatchers] = new UnicodeSet(leadCodePoints);
|
||||
}
|
||||
}
|
||||
|
||||
void NumberParserImpl::freeze() {
|
||||
fFrozen = true;
|
||||
}
|
||||
|
@ -276,12 +246,11 @@ void NumberParserImpl::parseGreedyRecursive(StringSegment& segment, ParsedNumber
|
|||
}
|
||||
|
||||
int initialOffset = segment.getOffset();
|
||||
int leadCp = segment.getCodePoint();
|
||||
for (int32_t i = 0; i < fNumMatchers; i++) {
|
||||
if (fComputeLeads && !fLeads[i]->contains(leadCp)) {
|
||||
const NumberParseMatcher* matcher = fMatchers[i];
|
||||
if (!matcher->smokeTest(segment)) {
|
||||
continue;
|
||||
}
|
||||
const NumberParseMatcher* matcher = fMatchers[i];
|
||||
matcher->match(segment, result, status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
|
@ -313,8 +282,10 @@ void NumberParserImpl::parseLongestRecursive(StringSegment& segment, ParsedNumbe
|
|||
|
||||
int initialOffset = segment.getOffset();
|
||||
for (int32_t i = 0; i < fNumMatchers; i++) {
|
||||
// TODO: Check leadChars here?
|
||||
const NumberParseMatcher* matcher = fMatchers[i];
|
||||
if (!matcher->smokeTest(segment)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// In a non-greedy parse, we attempt all possible matches and pick the best.
|
||||
for (int32_t charsToConsume = 0; charsToConsume < segment.length();) {
|
||||
|
|
|
@ -32,6 +32,11 @@ class NumberParserImpl : public MutableMatcherCollection {
|
|||
const number::impl::DecimalFormatProperties& properties, const DecimalFormatSymbols& symbols,
|
||||
bool parseCurrency, bool optimize, UErrorCode& status);
|
||||
|
||||
/**
|
||||
* Does NOT take ownership of the matcher. The matcher MUST remain valid for the lifespan of the
|
||||
* NumberParserImpl.
|
||||
* @param matcher The matcher to reference.
|
||||
*/
|
||||
void addMatcher(NumberParseMatcher& matcher) override;
|
||||
|
||||
void freeze();
|
||||
|
@ -48,8 +53,6 @@ class NumberParserImpl : public MutableMatcherCollection {
|
|||
int32_t fNumMatchers = 0;
|
||||
// NOTE: The stack capacity for fMatchers and fLeads should be the same
|
||||
MaybeStackArray<const NumberParseMatcher*, 10> fMatchers;
|
||||
MaybeStackArray<const UnicodeSet*, 10> fLeads;
|
||||
bool fComputeLeads;
|
||||
bool fFrozen = false;
|
||||
|
||||
// WARNING: All of these matchers start in an undefined state (default-constructed).
|
||||
|
@ -78,9 +81,7 @@ class NumberParserImpl : public MutableMatcherCollection {
|
|||
RequireNumberValidator number;
|
||||
} fLocalValidators;
|
||||
|
||||
NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);
|
||||
|
||||
void addLeadCodePointsForMatcher(NumberParseMatcher& matcher);
|
||||
explicit NumberParserImpl(parse_flags_t parseFlags);
|
||||
|
||||
void parseGreedyRecursive(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const;
|
||||
|
||||
|
|
|
@ -44,10 +44,10 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
|
|||
|
||||
// Allow a sign, and then try to match digits.
|
||||
int8_t exponentSign = 1;
|
||||
if (segment.matches(*unisets::get(unisets::MINUS_SIGN))) {
|
||||
if (segment.startsWith(*unisets::get(unisets::MINUS_SIGN))) {
|
||||
exponentSign = -1;
|
||||
segment.adjustOffsetByCodePoint();
|
||||
} else if (segment.matches(*unisets::get(unisets::PLUS_SIGN))) {
|
||||
} else if (segment.startsWith(*unisets::get(unisets::PLUS_SIGN))) {
|
||||
segment.adjustOffsetByCodePoint();
|
||||
}
|
||||
|
||||
|
@ -71,20 +71,8 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
|
|||
return false;
|
||||
}
|
||||
|
||||
const UnicodeSet& ScientificMatcher::getLeadCodePoints() {
|
||||
UChar32 leadCp = fExponentSeparatorString.char32At(0);
|
||||
const UnicodeSet* s = unisets::get(unisets::SCIENTIFIC_LEAD);
|
||||
if (s->contains(leadCp)) {
|
||||
return *s;
|
||||
}
|
||||
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
leadCodePoints->add(leadCp);
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
bool ScientificMatcher::smokeTest(const StringSegment& segment) const {
|
||||
return segment.startsWith(fExponentSeparatorString);
|
||||
}
|
||||
|
||||
UnicodeString ScientificMatcher::toString() const {
|
||||
|
|
|
@ -25,7 +25,7 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory {
|
|||
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
|
|
|
@ -75,11 +75,11 @@ UChar32 StringSegment::getCodePoint() const {
|
|||
}
|
||||
}
|
||||
|
||||
bool StringSegment::matches(UChar32 otherCp) const {
|
||||
bool StringSegment::startsWith(UChar32 otherCp) const {
|
||||
return codePointsEqual(getCodePoint(), otherCp, fFoldCase);
|
||||
}
|
||||
|
||||
bool StringSegment::matches(const UnicodeSet& uniset) const {
|
||||
bool StringSegment::startsWith(const UnicodeSet& uniset) const {
|
||||
// TODO: Move UnicodeSet case-folding logic here.
|
||||
// TODO: Handle string matches here instead of separately.
|
||||
UChar32 cp = getCodePoint();
|
||||
|
@ -89,6 +89,15 @@ bool StringSegment::matches(const UnicodeSet& uniset) const {
|
|||
return uniset.contains(cp);
|
||||
}
|
||||
|
||||
bool StringSegment::startsWith(const UnicodeString& other) const {
|
||||
if (other.isBogus() || other.length() == 0 || length() == 0) {
|
||||
return false;
|
||||
}
|
||||
int cp1 = getCodePoint();
|
||||
int cp2 = other.char32At(0);
|
||||
return codePointsEqual(cp1, cp2, fFoldCase);
|
||||
}
|
||||
|
||||
int32_t StringSegment::getCommonPrefixLength(const UnicodeString& other) {
|
||||
return getPrefixLengthInternal(other, fFoldCase);
|
||||
}
|
||||
|
|
|
@ -58,20 +58,8 @@ bool SymbolMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCo
|
|||
return overlap == segment.length();
|
||||
}
|
||||
|
||||
const UnicodeSet& SymbolMatcher::getLeadCodePoints() {
|
||||
if (fString.isEmpty()) {
|
||||
// Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
|
||||
return *fUniSet;
|
||||
}
|
||||
|
||||
if (fLocalLeadCodePoints.isNull()) {
|
||||
auto* leadCodePoints = new UnicodeSet();
|
||||
utils::putLeadCodePoints(fUniSet, leadCodePoints);
|
||||
utils::putLeadCodePoint(fString, leadCodePoints);
|
||||
leadCodePoints->freeze();
|
||||
fLocalLeadCodePoints.adoptInstead(leadCodePoints);
|
||||
}
|
||||
return *fLocalLeadCodePoints;
|
||||
bool SymbolMatcher::smokeTest(const StringSegment& segment) const {
|
||||
return segment.startsWith(*fUniSet) || segment.startsWith(fString);
|
||||
}
|
||||
|
||||
UnicodeString SymbolMatcher::toString() const {
|
||||
|
@ -134,17 +122,6 @@ NanMatcher::NanMatcher(const DecimalFormatSymbols& dfs)
|
|||
: SymbolMatcher(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol), unisets::EMPTY) {
|
||||
}
|
||||
|
||||
const UnicodeSet& NanMatcher::getLeadCodePoints() {
|
||||
// Overriding this here to allow use of statically allocated sets
|
||||
int leadCp = fString.char32At(0);
|
||||
const UnicodeSet* s = unisets::get(unisets::NAN_LEAD);
|
||||
if (s->contains(leadCp)) {
|
||||
return *s;
|
||||
}
|
||||
|
||||
return SymbolMatcher::getLeadCodePoints();
|
||||
}
|
||||
|
||||
bool NanMatcher::isDisabled(const ParsedNumber& result) const {
|
||||
return result.seenNumber();
|
||||
}
|
||||
|
|
|
@ -28,7 +28,7 @@ class SymbolMatcher : public NumberParseMatcher, public UMemory {
|
|||
|
||||
bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const override;
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
bool smokeTest(const StringSegment& segment) const override;
|
||||
|
||||
UnicodeString toString() const override;
|
||||
|
||||
|
@ -96,8 +96,6 @@ class NanMatcher : public SymbolMatcher {
|
|||
|
||||
NanMatcher(const DecimalFormatSymbols& dfs);
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() override;
|
||||
|
||||
protected:
|
||||
bool isDisabled(const ParsedNumber& result) const override;
|
||||
|
||||
|
|
|
@ -26,7 +26,7 @@ enum ResultFlags {
|
|||
FLAG_PERCENT = 0x0002,
|
||||
FLAG_PERMILLE = 0x0004,
|
||||
FLAG_HAS_EXPONENT = 0x0008,
|
||||
FLAG_HAS_DEFAULT_CURRENCY = 0x0010,
|
||||
// FLAG_HAS_DEFAULT_CURRENCY = 0x0010, // no longer used
|
||||
FLAG_HAS_DECIMAL_SEPARATOR = 0x0020,
|
||||
FLAG_NAN = 0x0040,
|
||||
FLAG_INFINITY = 0x0080,
|
||||
|
@ -46,6 +46,7 @@ enum ParseFlags {
|
|||
PARSE_FLAG_USE_FULL_AFFIXES = 0x0100,
|
||||
PARSE_FLAG_EXACT_AFFIX = 0x0200,
|
||||
PARSE_FLAG_PLUS_SIGN_ALLOWED = 0x0400,
|
||||
PARSE_FLAG_OPTIMIZE = 0x0800,
|
||||
};
|
||||
|
||||
|
||||
|
@ -216,12 +217,18 @@ class StringSegment : public UMemory, public ::icu::number::impl::CharSequence {
|
|||
* <p>
|
||||
* This method will perform case folding if case folding is enabled for the parser.
|
||||
*/
|
||||
bool matches(UChar32 otherCp) const;
|
||||
bool startsWith(UChar32 otherCp) const;
|
||||
|
||||
/**
|
||||
* Returns true if the first code point of this StringSegment is in the given UnicodeSet.
|
||||
*/
|
||||
bool matches(const UnicodeSet& uniset) const;
|
||||
bool startsWith(const UnicodeSet& uniset) const;
|
||||
|
||||
/**
|
||||
* Returns true if there is at least one code point of overlap between this StringSegment and the
|
||||
* given UnicodeString.
|
||||
*/
|
||||
bool startsWith(const UnicodeString& other) const;
|
||||
|
||||
/**
|
||||
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
|
||||
|
@ -294,17 +301,18 @@ class NumberParseMatcher {
|
|||
virtual bool match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const = 0;
|
||||
|
||||
/**
|
||||
* Should return a set representing all possible chars (UTF-16 code units) that could be the first
|
||||
* char that this matcher can consume. This method is only called during construction phase, and its
|
||||
* return value is used to skip this matcher unless a segment begins with a char in this set. To make
|
||||
* this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}.
|
||||
* Performs a fast "smoke check" for whether or not this matcher could possibly match against the
|
||||
* given string segment. The test should be as fast as possible but also as restrictive as possible.
|
||||
* For example, matchers can maintain a UnicodeSet of all code points that could possibly start a
|
||||
* match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly
|
||||
* handle case folding.
|
||||
*
|
||||
* The returned UnicodeSet does not need adoption and is guaranteed to be alive for as long as the
|
||||
* object that returned it.
|
||||
*
|
||||
* This method is NOT thread-safe.
|
||||
* @param segment
|
||||
* The segment to check against.
|
||||
* @return true if the matcher might be able to match against this segment; false if it definitely
|
||||
* will not be able to match.
|
||||
*/
|
||||
virtual const UnicodeSet& getLeadCodePoints() = 0;
|
||||
virtual bool smokeTest(const StringSegment& segment) const = 0;
|
||||
|
||||
/**
|
||||
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
|
||||
|
@ -324,9 +332,6 @@ class NumberParseMatcher {
|
|||
protected:
|
||||
// No construction except by subclasses!
|
||||
NumberParseMatcher() = default;
|
||||
|
||||
// Optional ownership of the leadCodePoints set
|
||||
LocalPointer<const UnicodeSet> fLocalLeadCodePoints;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -93,9 +93,6 @@ void U_CALLCONV initNumberParseUniSets(UErrorCode& status) {
|
|||
gUnicodeSets[INFINITY_KEY] = new UnicodeSet(u"[∞]", status);
|
||||
|
||||
gUnicodeSets[DIGITS] = new UnicodeSet(u"[:digit:]", status);
|
||||
gUnicodeSets[NAN_LEAD] = new UnicodeSet(
|
||||
u"[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]", status);
|
||||
gUnicodeSets[SCIENTIFIC_LEAD] = new UnicodeSet(u"[Ee×·е\u0627]", status);
|
||||
gUnicodeSets[CWCF] = new UnicodeSet(u"[:CWCF:]", status);
|
||||
|
||||
gUnicodeSets[DIGITS_OR_ALL_SEPARATORS] = computeUnion(DIGITS, ALL_SEPARATORS);
|
||||
|
|
|
@ -47,8 +47,6 @@ enum Key {
|
|||
|
||||
// Other
|
||||
DIGITS,
|
||||
NAN_LEAD,
|
||||
SCIENTIFIC_LEAD,
|
||||
CWCF,
|
||||
|
||||
// Combined Separators with Digits (for lead code points)
|
||||
|
|
|
@ -21,12 +21,12 @@ class ValidationMatcher : public NumberParseMatcher {
|
|||
return false;
|
||||
}
|
||||
|
||||
const UnicodeSet& getLeadCodePoints() U_OVERRIDE {
|
||||
bool smokeTest(const StringSegment&) const U_OVERRIDE {
|
||||
// No-op
|
||||
return *unisets::get(unisets::EMPTY);
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void postProcess(ParsedNumber& result) const U_OVERRIDE = 0;
|
||||
void postProcess(ParsedNumber& result) const U_OVERRIDE = 0;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -176,10 +176,9 @@ void NumberParserTest::testSeriesMatcher() {
|
|||
matchers[4] = &m4;
|
||||
ArraySeriesMatcher series(matchers, 5);
|
||||
|
||||
assertEquals(
|
||||
"Lead set should be equal to lead set of lead matcher",
|
||||
*unisets::get(unisets::PLUS_SIGN),
|
||||
series.getLeadCodePoints());
|
||||
assertFalse("", series.smokeTest(StringSegment(u"x", false)));
|
||||
assertFalse("", series.smokeTest(StringSegment(u"-", false)));
|
||||
assertTrue("", series.smokeTest(StringSegment(u"+", false)));
|
||||
|
||||
static const struct TestCase {
|
||||
const char16_t* input;
|
||||
|
|
|
@ -46,8 +46,6 @@ void UniSetsTest::testSetCoverage() {
|
|||
const UnicodeSet &percent = *get(unisets::PERCENT_SIGN);
|
||||
const UnicodeSet &permille = *get(unisets::PERMILLE_SIGN);
|
||||
const UnicodeSet &infinity = *get(unisets::INFINITY_KEY);
|
||||
const UnicodeSet &nanLead = *get(unisets::NAN_LEAD);
|
||||
const UnicodeSet &scientificLead = *get(unisets::SCIENTIFIC_LEAD);
|
||||
|
||||
int32_t localeCount;
|
||||
const Locale* allAvailableLocales = Locale::getAvailableLocales(localeCount);
|
||||
|
@ -66,11 +64,6 @@ void UniSetsTest::testSetCoverage() {
|
|||
ASSERT_IN_SET(percent, dfs.getConstSymbol(DecimalFormatSymbols::kPercentSymbol));
|
||||
ASSERT_IN_SET(permille, dfs.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol));
|
||||
ASSERT_IN_SET(infinity, dfs.getConstSymbol(DecimalFormatSymbols::kInfinitySymbol));
|
||||
ASSERT_IN_SET(nanLead, dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0));
|
||||
ASSERT_IN_SET(nanLead,
|
||||
u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kNaNSymbol).char32At(0), 0));
|
||||
ASSERT_IN_SET(scientificLead,
|
||||
u_foldCase(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol).char32At(0), 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -134,6 +134,19 @@ public class StringSegment implements CharSequence {
|
|||
return uniset.contains(cp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if there is at least one code point of overlap between this StringSegment and the
|
||||
* given CharSequence. Null-safe.
|
||||
*/
|
||||
public boolean startsWith(CharSequence other) {
|
||||
if (other == null || other.length() == 0 || length() == 0) {
|
||||
return false;
|
||||
}
|
||||
int cp1 = Character.codePointAt(this, 0);
|
||||
int cp2 = Character.codePointAt(other, 0);
|
||||
return codePointsEqual(cp1, cp2, foldCase);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the length of the prefix shared by this StringSegment and the given CharSequence. For
|
||||
* example, if this string segment is "aab", and the char sequence is "aac", this method returns 2,
|
||||
|
|
|
@ -13,7 +13,6 @@ import com.ibm.icu.impl.number.AffixPatternProvider;
|
|||
import com.ibm.icu.impl.number.AffixUtils;
|
||||
import com.ibm.icu.impl.number.PatternStringUtils;
|
||||
import com.ibm.icu.number.NumberFormatter.SignDisplay;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author sffc
|
||||
|
@ -206,15 +205,9 @@ public class AffixMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
UnicodeSet leadCodePoints = new UnicodeSet();
|
||||
if (prefix != null) {
|
||||
leadCodePoints.addAll(prefix.getLeadCodePoints());
|
||||
}
|
||||
if (suffix != null) {
|
||||
leadCodePoints.addAll(suffix.getLeadCodePoints());
|
||||
}
|
||||
return leadCodePoints.freeze();
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return (prefix != null && prefix.smokeTest(segment))
|
||||
|| (suffix != null && suffix.smokeTest(segment));
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -6,7 +6,6 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Composes a number of matchers, and succeeds if any of the matchers succeed. Always greedily chooses
|
||||
|
@ -58,22 +57,18 @@ public class AnyMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
assert frozen;
|
||||
if (matchers == null) {
|
||||
return UnicodeSet.EMPTY;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (matchers.size() == 1) {
|
||||
return matchers.get(0).getLeadCodePoints();
|
||||
}
|
||||
|
||||
UnicodeSet leadCodePoints = new UnicodeSet();
|
||||
for (int i = 0; i < matchers.size(); i++) {
|
||||
NumberParseMatcher matcher = matchers.get(i);
|
||||
leadCodePoints.addAll(matcher.getLeadCodePoints());
|
||||
if (matchers.get(i).smokeTest(segment)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return leadCodePoints.freeze();
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
package com.ibm.icu.impl.number.parse;
|
||||
|
||||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Matches a single code point, performing no other logic.
|
||||
|
@ -33,8 +32,8 @@ public class CodePointMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
return new UnicodeSet().add(cp).freeze();
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return segment.startsWith(cp);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
package com.ibm.icu.impl.number.parse;
|
||||
|
||||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.Currency;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
|
@ -52,11 +51,8 @@ public class CurrencyCustomMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
UnicodeSet leadCodePoints = new UnicodeSet();
|
||||
ParsingUtils.putLeadCodePoint(currency1, leadCodePoints);
|
||||
ParsingUtils.putLeadCodePoint(currency2, leadCodePoints);
|
||||
return leadCodePoints.freeze();
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return segment.startsWith(currency1) || segment.startsWith(currency2);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -23,6 +23,8 @@ public class CurrencyNamesMatcher implements NumberParseMatcher {
|
|||
private final TextTrieMap<CurrencyStringInfo> longNameTrie;
|
||||
private final TextTrieMap<CurrencyStringInfo> symbolTrie;
|
||||
|
||||
private final UnicodeSet leadCodePoints;
|
||||
|
||||
public static CurrencyNamesMatcher getInstance(ULocale locale) {
|
||||
// TODO: Pre-compute some of the more popular locales?
|
||||
return new CurrencyNamesMatcher(locale);
|
||||
|
@ -33,6 +35,15 @@ public class CurrencyNamesMatcher implements NumberParseMatcher {
|
|||
// case folding on long-names but not symbols.
|
||||
longNameTrie = Currency.getParsingTrie(locale, Currency.LONG_NAME);
|
||||
symbolTrie = Currency.getParsingTrie(locale, Currency.SYMBOL_NAME);
|
||||
|
||||
// Compute the full set of characters that could be the first in a currency to allow for
|
||||
// an efficient smoke test.
|
||||
leadCodePoints = new UnicodeSet();
|
||||
longNameTrie.putLeadCodePoints(leadCodePoints);
|
||||
symbolTrie.putLeadCodePoints(leadCodePoints);
|
||||
// Always apply case mapping closure for currencies
|
||||
leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
|
||||
leadCodePoints.freeze();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -55,13 +66,8 @@ public class CurrencyNamesMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
UnicodeSet leadCodePoints = new UnicodeSet();
|
||||
longNameTrie.putLeadCodePoints(leadCodePoints);
|
||||
symbolTrie.putLeadCodePoints(leadCodePoints);
|
||||
// Always apply case mapping closure for currencies
|
||||
leadCodePoints.closeOver(UnicodeSet.ADD_CASE_MAPPINGS);
|
||||
return leadCodePoints.freeze();
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return segment.startsWith(leadCodePoints);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -329,21 +329,23 @@ public class DecimalMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
// The common case uses a static leadSet for efficiency.
|
||||
if (digitStrings == null && leadSet != null) {
|
||||
return leadSet;
|
||||
return segment.startsWith(leadSet);
|
||||
}
|
||||
|
||||
UnicodeSet leadCodePoints = new UnicodeSet();
|
||||
// Assumption: the sets are all single code points.
|
||||
leadCodePoints.addAll(UnicodeSetStaticCache.get(Key.DIGITS));
|
||||
leadCodePoints.addAll(separatorSet);
|
||||
if (digitStrings != null) {
|
||||
for (int i = 0; i < digitStrings.length; i++) {
|
||||
ParsingUtils.putLeadCodePoint(digitStrings[i], leadCodePoints);
|
||||
if (segment.startsWith(separatorSet) || UCharacter.isDigit(segment.getCodePoint())) {
|
||||
return true;
|
||||
}
|
||||
if (digitStrings == null) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < digitStrings.length; i++) {
|
||||
if (segment.startsWith(digitStrings[i])) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return leadCodePoints.freeze();
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -27,18 +27,6 @@ public class NanMatcher extends SymbolMatcher {
|
|||
super(symbolString, UnicodeSet.EMPTY);
|
||||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
// Overriding this here to allow use of statically allocated sets
|
||||
int leadCp = string.codePointAt(0);
|
||||
UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.NAN_LEAD);
|
||||
if (s.contains(leadCp)) {
|
||||
return s;
|
||||
} else {
|
||||
return super.getLeadCodePoints();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean isDisabled(ParsedNumber result) {
|
||||
return result.seenNumber();
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
package com.ibm.icu.impl.number.parse;
|
||||
|
||||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* The core interface implemented by all matchers used for number parsing.
|
||||
|
@ -45,12 +44,18 @@ public interface NumberParseMatcher {
|
|||
public boolean match(StringSegment segment, ParsedNumber result);
|
||||
|
||||
/**
|
||||
* Should return a set representing all possible chars (UTF-16 code units) that could be the first
|
||||
* char that this matcher can consume. This method is only called during construction phase, and its
|
||||
* return value is used to skip this matcher unless a segment begins with a char in this set. To make
|
||||
* this matcher always run, return {@link UnicodeSet#ALL_CODE_POINTS}.
|
||||
* Performs a fast "smoke check" for whether or not this matcher could possibly match against the
|
||||
* given string segment. The test should be as fast as possible but also as restrictive as possible.
|
||||
* For example, matchers can maintain a UnicodeSet of all code points that could possibly start a
|
||||
* match. Matchers should use the {@link StringSegment#startsWith} method in order to correctly
|
||||
* handle case folding.
|
||||
*
|
||||
* @param segment
|
||||
* The segment to check against.
|
||||
* @return true if the matcher might be able to match against this segment; false if it definitely
|
||||
* will not be able to match.
|
||||
*/
|
||||
public UnicodeSet getLeadCodePoints();
|
||||
public boolean smokeTest(StringSegment segment);
|
||||
|
||||
/**
|
||||
* Method called at the end of a parse, after all matchers have failed to consume any more chars.
|
||||
|
|
|
@ -19,7 +19,6 @@ import com.ibm.icu.impl.number.PropertiesAffixPatternProvider;
|
|||
import com.ibm.icu.impl.number.RoundingUtils;
|
||||
import com.ibm.icu.number.NumberFormatter.GroupingStrategy;
|
||||
import com.ibm.icu.text.DecimalFormatSymbols;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.Currency;
|
||||
import com.ibm.icu.util.CurrencyAmount;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
@ -250,7 +249,6 @@ public class NumberParserImpl {
|
|||
|
||||
private final int parseFlags;
|
||||
private final List<NumberParseMatcher> matchers;
|
||||
private final List<UnicodeSet> leads;
|
||||
private boolean frozen;
|
||||
|
||||
/**
|
||||
|
@ -261,11 +259,6 @@ public class NumberParserImpl {
|
|||
*/
|
||||
public NumberParserImpl(int parseFlags) {
|
||||
matchers = new ArrayList<NumberParseMatcher>();
|
||||
if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_OPTIMIZE)) {
|
||||
leads = new ArrayList<UnicodeSet>();
|
||||
} else {
|
||||
leads = null;
|
||||
}
|
||||
this.parseFlags = parseFlags;
|
||||
frozen = false;
|
||||
}
|
||||
|
@ -273,30 +266,11 @@ public class NumberParserImpl {
|
|||
public void addMatcher(NumberParseMatcher matcher) {
|
||||
assert !frozen;
|
||||
this.matchers.add(matcher);
|
||||
if (leads != null) {
|
||||
addLeadCodePointsForMatcher(matcher);
|
||||
}
|
||||
}
|
||||
|
||||
public void addMatchers(Collection<? extends NumberParseMatcher> matchers) {
|
||||
assert !frozen;
|
||||
this.matchers.addAll(matchers);
|
||||
if (leads != null) {
|
||||
for (NumberParseMatcher matcher : matchers) {
|
||||
addLeadCodePointsForMatcher(matcher);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void addLeadCodePointsForMatcher(NumberParseMatcher matcher) {
|
||||
UnicodeSet leadCodePoints = matcher.getLeadCodePoints();
|
||||
assert leadCodePoints.isFrozen();
|
||||
// TODO: Avoid the clone operation here.
|
||||
if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_IGNORE_CASE)) {
|
||||
leadCodePoints = leadCodePoints.cloneAsThawed().closeOver(UnicodeSet.ADD_CASE_MAPPINGS)
|
||||
.freeze();
|
||||
}
|
||||
this.leads.add(leadCodePoints);
|
||||
}
|
||||
|
||||
public void freeze() {
|
||||
|
@ -343,12 +317,11 @@ public class NumberParserImpl {
|
|||
}
|
||||
|
||||
int initialOffset = segment.getOffset();
|
||||
int leadCp = segment.getCodePoint();
|
||||
for (int i = 0; i < matchers.size(); i++) {
|
||||
if (leads != null && !leads.get(i).contains(leadCp)) {
|
||||
NumberParseMatcher matcher = matchers.get(i);
|
||||
if (!matcher.smokeTest(segment)) {
|
||||
continue;
|
||||
}
|
||||
NumberParseMatcher matcher = matchers.get(i);
|
||||
matcher.match(segment, result);
|
||||
if (segment.getOffset() != initialOffset) {
|
||||
// In a greedy parse, recurse on only the first match.
|
||||
|
@ -377,8 +350,10 @@ public class NumberParserImpl {
|
|||
|
||||
int initialOffset = segment.getOffset();
|
||||
for (int i = 0; i < matchers.size(); i++) {
|
||||
// TODO: Check leadChars here?
|
||||
NumberParseMatcher matcher = matchers.get(i);
|
||||
if (!matcher.smokeTest(segment)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// In a non-greedy parse, we attempt all possible matches and pick the best.
|
||||
for (int charsToConsume = 0; charsToConsume < segment.length();) {
|
||||
|
|
|
@ -5,7 +5,6 @@ package com.ibm.icu.impl.number.parse;
|
|||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.impl.number.Grouper;
|
||||
import com.ibm.icu.text.DecimalFormatSymbols;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* @author sffc
|
||||
|
@ -78,14 +77,8 @@ public class ScientificMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
int leadCp = exponentSeparatorString.codePointAt(0);
|
||||
UnicodeSet s = UnicodeSetStaticCache.get(UnicodeSetStaticCache.Key.SCIENTIFIC_LEAD);
|
||||
if (s.contains(leadCp)) {
|
||||
return s;
|
||||
} else {
|
||||
return new UnicodeSet().add(leadCp).freeze();
|
||||
}
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return segment.startsWith(exponentSeparatorString);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -6,7 +6,6 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* Composes a number of matchers, running one after another. Matches the input string only if all of the
|
||||
|
@ -82,15 +81,15 @@ public class SeriesMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
assert frozen;
|
||||
if (matchers == null) {
|
||||
return UnicodeSet.EMPTY;
|
||||
return false;
|
||||
}
|
||||
|
||||
// SeriesMatchers are never allowed to start with a Flexible matcher.
|
||||
assert !(matchers.get(0) instanceof NumberParseMatcher.Flexible);
|
||||
return matchers.get(0).getLeadCodePoints();
|
||||
return matchers.get(0).smokeTest(segment);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -59,16 +59,8 @@ public abstract class SymbolMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
if (string.isEmpty()) {
|
||||
// Assumption: for sets from UnicodeSetStaticCache, uniSet == leadCodePoints.
|
||||
return uniSet;
|
||||
}
|
||||
|
||||
UnicodeSet leadCodePoints = new UnicodeSet();
|
||||
ParsingUtils.putLeadCodePoints(uniSet, leadCodePoints);
|
||||
ParsingUtils.putLeadCodePoint(string, leadCodePoints);
|
||||
return leadCodePoints.freeze();
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return segment.startsWith(uniSet) || segment.startsWith(string);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -49,8 +49,6 @@ public class UnicodeSetStaticCache {
|
|||
|
||||
// Other
|
||||
DIGITS,
|
||||
NAN_LEAD,
|
||||
SCIENTIFIC_LEAD,
|
||||
CWCF, // TODO: Check if this is being used and remove it if not.
|
||||
|
||||
// Combined Separators with Digits (for lead code points)
|
||||
|
@ -112,10 +110,6 @@ public class UnicodeSetStaticCache {
|
|||
unicodeSets.put(Key.INFINITY, new UnicodeSet("[∞]").freeze());
|
||||
|
||||
unicodeSets.put(Key.DIGITS, new UnicodeSet("[:digit:]").freeze());
|
||||
unicodeSets.put(Key.NAN_LEAD,
|
||||
new UnicodeSet("[NnТтmeՈոс¤НнчTtsҳ\u975e\u1002\u0e9a\u10d0\u0f68\u0644\u0646]")
|
||||
.freeze());
|
||||
unicodeSets.put(Key.SCIENTIFIC_LEAD, new UnicodeSet("[Ee×·е\u0627]").freeze());
|
||||
unicodeSets.put(Key.CWCF, new UnicodeSet("[:CWCF:]").freeze());
|
||||
|
||||
unicodeSets.put(Key.DIGITS_OR_ALL_SEPARATORS, computeUnion(Key.DIGITS, Key.ALL_SEPARATORS));
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
package com.ibm.icu.impl.number.parse;
|
||||
|
||||
import com.ibm.icu.impl.StringSegment;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
|
||||
/**
|
||||
* A Matcher used only for post-process validation, not for consuming characters at runtime.
|
||||
|
@ -16,8 +15,8 @@ public abstract class ValidationMatcher implements NumberParseMatcher {
|
|||
}
|
||||
|
||||
@Override
|
||||
public UnicodeSet getLeadCodePoints() {
|
||||
return UnicodeSet.EMPTY;
|
||||
public boolean smokeTest(StringSegment segment) {
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
package com.ibm.icu.dev.test.number;
|
||||
|
||||
import static com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.get;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.Random;
|
||||
|
||||
|
@ -10,8 +12,12 @@ import org.junit.Test;
|
|||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.number.DecimalQuantity_DualStorageBCD;
|
||||
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.number.NumberFormatter;
|
||||
import com.ibm.icu.number.Rounder;
|
||||
import com.ibm.icu.text.DecimalFormatSymbols;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
|
@ -27,6 +33,60 @@ public class ExhaustiveNumberTest extends TestFmwk {
|
|||
org.junit.Assume.assumeTrue(getExhaustiveness() > 5);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSetCoverage() {
|
||||
// Lenient comma/period should be supersets of strict comma/period;
|
||||
// it also makes the coverage logic cheaper.
|
||||
assertTrue("COMMA should be superset of STRICT_COMMA",
|
||||
get(Key.COMMA).containsAll(get(Key.STRICT_COMMA)));
|
||||
assertTrue("PERIOD should be superset of STRICT_PERIOD",
|
||||
get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD)));
|
||||
|
||||
UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD))
|
||||
.freeze();
|
||||
UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS))
|
||||
.freeze();
|
||||
UnicodeSet plusSign = get(Key.PLUS_SIGN);
|
||||
UnicodeSet minusSign = get(Key.MINUS_SIGN);
|
||||
UnicodeSet percent = get(Key.PERCENT_SIGN);
|
||||
UnicodeSet permille = get(Key.PERMILLE_SIGN);
|
||||
UnicodeSet infinity = get(Key.INFINITY);
|
||||
|
||||
for (ULocale locale : ULocale.getAvailableLocales()) {
|
||||
DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale);
|
||||
|
||||
assertInSet(locale, decimals, dfs.getDecimalSeparatorString());
|
||||
assertInSet(locale, grouping, dfs.getGroupingSeparatorString());
|
||||
assertInSet(locale, plusSign, dfs.getPlusSignString());
|
||||
assertInSet(locale, minusSign, dfs.getMinusSignString());
|
||||
assertInSet(locale, percent, dfs.getPercentString());
|
||||
assertInSet(locale, permille, dfs.getPerMillString());
|
||||
assertInSet(locale, infinity, dfs.getInfinity());
|
||||
}
|
||||
}
|
||||
|
||||
static void assertInSet(ULocale locale, UnicodeSet set, String str) {
|
||||
if (str.codePointCount(0, str.length()) != 1) {
|
||||
// Ignore locale strings with more than one code point (usually a bidi mark)
|
||||
return;
|
||||
}
|
||||
assertInSet(locale, set, str.codePointAt(0));
|
||||
}
|
||||
|
||||
static void assertInSet(ULocale locale, UnicodeSet set, int cp) {
|
||||
// If this test case fails, add the specified code point to the corresponding set in
|
||||
// UnicodeSetStaticCache.java and numparse_unisets.cpp
|
||||
assertTrue(
|
||||
locale
|
||||
+ " U+"
|
||||
+ Integer.toHexString(cp)
|
||||
+ " ("
|
||||
+ UCharacter.toString(cp)
|
||||
+ ") should be in "
|
||||
+ set,
|
||||
set.contains(cp));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void unlimitedRoundingBigDecimal() {
|
||||
BigDecimal ten10000 = BigDecimal.valueOf(10).pow(10000);
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
package com.ibm.icu.dev.test.number;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import org.junit.Test;
|
||||
|
@ -21,8 +22,6 @@ import com.ibm.icu.impl.number.parse.ParsingUtils;
|
|||
import com.ibm.icu.impl.number.parse.PercentMatcher;
|
||||
import com.ibm.icu.impl.number.parse.PlusSignMatcher;
|
||||
import com.ibm.icu.impl.number.parse.SeriesMatcher;
|
||||
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache;
|
||||
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
|
||||
import com.ibm.icu.text.DecimalFormatSymbols;
|
||||
import com.ibm.icu.util.Currency;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
@ -196,7 +195,9 @@ public class NumberParserTest {
|
|||
series.addMatcher(IgnorablesMatcher.DEFAULT);
|
||||
series.freeze();
|
||||
|
||||
assertEquals(UnicodeSetStaticCache.get(Key.PLUS_SIGN), series.getLeadCodePoints());
|
||||
assertFalse(series.smokeTest(new StringSegment("x", false)));
|
||||
assertFalse(series.smokeTest(new StringSegment("-", false)));
|
||||
assertTrue(series.smokeTest(new StringSegment("+", false)));
|
||||
|
||||
Object[][] cases = new Object[][] {
|
||||
{ "", 0, true },
|
||||
|
|
|
@ -30,9 +30,9 @@ import org.junit.Test;
|
|||
|
||||
import com.ibm.icu.dev.test.serializable.SerializableTestUtility;
|
||||
import com.ibm.icu.impl.number.DecimalFormatProperties;
|
||||
import com.ibm.icu.impl.number.DecimalFormatProperties.ParseMode;
|
||||
import com.ibm.icu.impl.number.Padder.PadPosition;
|
||||
import com.ibm.icu.impl.number.PatternStringParser;
|
||||
import com.ibm.icu.impl.number.parse.NumberParserImpl.ParseMode;
|
||||
import com.ibm.icu.text.CompactDecimalFormat.CompactStyle;
|
||||
import com.ibm.icu.text.CurrencyPluralInfo;
|
||||
import com.ibm.icu.text.MeasureFormat.FormatWidth;
|
||||
|
|
|
@ -8,82 +8,17 @@ import static org.junit.Assert.assertTrue;
|
|||
import org.junit.Test;
|
||||
|
||||
import com.ibm.icu.impl.number.parse.UnicodeSetStaticCache.Key;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.DecimalFormatSymbols;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
* This test class is thin; most of it was moved to ExhaustiveNumberTest.
|
||||
* @author sffc
|
||||
*
|
||||
*/
|
||||
public class UnicodeSetStaticCacheTest {
|
||||
|
||||
@Test
|
||||
public void testSetCoverage() {
|
||||
// Lenient comma/period should be supersets of strict comma/period;
|
||||
// it also makes the coverage logic cheaper.
|
||||
assertTrue("COMMA should be superset of STRICT_COMMA",
|
||||
get(Key.COMMA).containsAll(get(Key.STRICT_COMMA)));
|
||||
assertTrue("PERIOD should be superset of STRICT_PERIOD",
|
||||
get(Key.PERIOD).containsAll(get(Key.STRICT_PERIOD)));
|
||||
|
||||
UnicodeSet decimals = get(Key.STRICT_COMMA).cloneAsThawed().addAll(get(Key.STRICT_PERIOD))
|
||||
.freeze();
|
||||
UnicodeSet grouping = decimals.cloneAsThawed().addAll(get(Key.OTHER_GROUPING_SEPARATORS))
|
||||
.freeze();
|
||||
UnicodeSet plusSign = get(Key.PLUS_SIGN);
|
||||
UnicodeSet minusSign = get(Key.MINUS_SIGN);
|
||||
UnicodeSet percent = get(Key.PERCENT_SIGN);
|
||||
UnicodeSet permille = get(Key.PERMILLE_SIGN);
|
||||
UnicodeSet infinity = get(Key.INFINITY);
|
||||
UnicodeSet nanLead = get(Key.NAN_LEAD);
|
||||
UnicodeSet scientificLead = get(Key.SCIENTIFIC_LEAD);
|
||||
|
||||
for (ULocale locale : ULocale.getAvailableLocales()) {
|
||||
DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(locale);
|
||||
|
||||
assertInSet(locale, decimals, dfs.getDecimalSeparatorString());
|
||||
assertInSet(locale, grouping, dfs.getGroupingSeparatorString());
|
||||
assertInSet(locale, plusSign, dfs.getPlusSignString());
|
||||
assertInSet(locale, minusSign, dfs.getMinusSignString());
|
||||
assertInSet(locale, percent, dfs.getPercentString());
|
||||
assertInSet(locale, permille, dfs.getPerMillString());
|
||||
assertInSet(locale, infinity, dfs.getInfinity());
|
||||
assertInSet(locale, nanLead, dfs.getNaN().codePointAt(0));
|
||||
assertInSet(locale, nanLead, UCharacter.foldCase(dfs.getNaN(), true).codePointAt(0));
|
||||
assertInSet(locale,
|
||||
scientificLead,
|
||||
UCharacter.foldCase(dfs.getExponentSeparator(), true).codePointAt(0));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFrozen() {
|
||||
for (Key key : Key.values()) {
|
||||
assertTrue(get(key).isFrozen());
|
||||
}
|
||||
}
|
||||
|
||||
static void assertInSet(ULocale locale, UnicodeSet set, String str) {
|
||||
if (str.codePointCount(0, str.length()) != 1) {
|
||||
// Ignore locale strings with more than one code point (usually a bidi mark)
|
||||
return;
|
||||
}
|
||||
assertInSet(locale, set, str.codePointAt(0));
|
||||
}
|
||||
|
||||
static void assertInSet(ULocale locale, UnicodeSet set, int cp) {
|
||||
// If this test case fails, add the specified code point to the corresponding set in
|
||||
// UnicodeSetStaticCache.java and numparse_unisets.cpp
|
||||
assertTrue(
|
||||
locale
|
||||
+ " U+"
|
||||
+ Integer.toHexString(cp)
|
||||
+ " ("
|
||||
+ UCharacter.toString(cp)
|
||||
+ ") should be in "
|
||||
+ set,
|
||||
set.contains(cp));
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue