ICU-13574 AffixMatcher is working. All simple parsing tests are passing.

X-SVN-Rev: 40903
This commit is contained in:
Shane Carr 2018-02-13 02:23:52 +00:00
parent 7b1857d0f3
commit 1ed7deaa8c
20 changed files with 313 additions and 168 deletions

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_affixes.h"
#include "numparse_utils.h"
@ -122,52 +126,32 @@ AffixPatternMatcher AffixPatternMatcherBuilder::build() {
}
AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const UChar* currencyCode,
const UnicodeString* currency1,
const UnicodeString* currency2,
const DecimalFormatSymbols* dfs,
IgnorablesMatcher* ignorables, const Locale* locale)
: currency1(currency1),
currency2(currency2),
dfs(dfs),
ignorables(ignorables),
locale(locale),
codePointCount(0),
codePointNumBatches(0) {
utils::copyCurrencyCode(this->currencyCode, currencyCode);
}
CodePointMatcherWarehouse::CodePointMatcherWarehouse()
: codePointCount(0), codePointNumBatches(0) {}
AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(
AffixTokenMatcherWarehouse&& src) U_NOEXCEPT = default;
AffixTokenMatcherWarehouse::~AffixTokenMatcherWarehouse() {
CodePointMatcherWarehouse::~CodePointMatcherWarehouse() {
    // Release each batch of code point matchers recorded in codePointsOverflow;
    // codePointNumBatches says how many slots are in use.
    int32_t batch = 0;
    while (batch < codePointNumBatches) {
        delete[] codePointsOverflow[batch];
        ++batch;
    }
}
NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
return fMinusSign = {*dfs, true};
CodePointMatcherWarehouse::CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT
        : codePoints(std::move(src.codePoints)),
          codePointsOverflow(std::move(src.codePointsOverflow)),
          codePointCount(src.codePointCount),
          codePointNumBatches(src.codePointNumBatches) {
    // Move constructor: takes over src's matchers and its batch bookkeeping.
    //
    // The batches are now owned by this object. The destructor frees
    // codePointNumBatches entries of codePointsOverflow, so src must forget its
    // batches; otherwise destroying src would delete[] them a second time.
    src.codePointCount = 0;
    src.codePointNumBatches = 0;
}
CodePointMatcherWarehouse&
CodePointMatcherWarehouse::operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT {
    // Move assignment: drops whatever this object owned and adopts src's contents.
    // Returns *this.
    if (this == &src) {
        // Self-move: nothing to transfer, and freeing our own batches would be wrong.
        return *this;
    }

    // Free the batches this object currently owns; they would otherwise leak once
    // the bookkeeping below is overwritten.
    for (int32_t i = 0; i < codePointNumBatches; i++) {
        delete[] codePointsOverflow[i];
    }

    codePoints = std::move(src.codePoints);
    codePointsOverflow = std::move(src.codePointsOverflow);
    codePointCount = src.codePointCount;
    codePointNumBatches = src.codePointNumBatches;

    // src no longer owns any batches; clear its counters so that its destructor
    // does not delete[] the batches that were just transferred.
    src.codePointCount = 0;
    src.codePointNumBatches = 0;
    return *this;
}
NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
return fPlusSign = {*dfs, true};
}
NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
return fPercent = {*dfs};
}
NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
return fPermille = {*dfs};
}
NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
return fCurrency = {{*locale, status}, {currencyCode, *currency1, *currency2}};
}
NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
NumberParseMatcher& CodePointMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
if (codePointCount < CODE_POINT_STACK_CAPACITY) {
return codePoints[codePointCount++] = {cp};
}
@ -186,6 +170,39 @@ NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp)
}
AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
: fSetupData(setupData) {}
NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
    // (Re)build the stored minus sign matcher from the setup data's symbols,
    // then hand out a reference to it.
    fMinusSign = {fSetupData->dfs, true};
    return fMinusSign;
}
NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
    // (Re)build the stored plus sign matcher from the setup data's symbols,
    // then hand out a reference to it.
    fPlusSign = {fSetupData->dfs, true};
    return fPlusSign;
}
NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
    // (Re)build the stored percent matcher from the setup data's symbols,
    // then hand out a reference to it.
    fPercent = {fSetupData->dfs};
    return fPercent;
}
NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
    // (Re)build the stored permille matcher from the setup data's symbols,
    // then hand out a reference to it.
    fPermille = {fSetupData->dfs};
    return fPermille;
}
NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
    // (Re)build the stored currency matcher. The first component is built from the
    // locale and status, the second from the currency code and the two currency
    // strings in the setup data.
    fCurrency = {{fSetupData->locale, status},
                 {fSetupData->currencyCode, fSetupData->currency1, fSetupData->currency2}};
    return fCurrency;
}
IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
    // The ignorables matcher is not owned here; it is the one referenced by the setup data.
    IgnorablesMatcher& matcher = fSetupData->ignorables;
    return matcher;
}
NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
    // Delegates to the embedded code point warehouse, which stores the matcher.
    NumberParseMatcher& matcher = fCodePoints.nextCodePointMatcher(cp);
    return matcher;
}
CodePointMatcher::CodePointMatcher(UChar32 cp)
: fCp(cp) {}
@ -207,9 +224,13 @@ const UnicodeSet& CodePointMatcher::getLeadCodePoints() {
return *fLocalLeadCodePoints;
}
UnicodeString CodePointMatcher::toString() const {
    // Fixed debug label; the code point itself is not included.
    UnicodeString label(u"<CodePoint>");
    return label;
}
AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
AffixTokenMatcherWarehouse& warehouse,
AffixTokenMatcherWarehouse& tokenWarehouse,
parse_flags_t parseFlags, bool* success,
UErrorCode& status) {
if (affixPattern.isEmpty()) {
@ -222,10 +243,10 @@ AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& a
if (0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX)) {
ignorables = nullptr;
} else {
ignorables = warehouse.ignorables;
ignorables = &tokenWarehouse.ignorables();
}
AffixPatternMatcherBuilder builder(affixPattern, warehouse, ignorables);
AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
AffixUtils::iterateWithConsumer(UnicodeStringCharSequence(affixPattern), builder, status);
return builder.build();
}
@ -243,10 +264,9 @@ bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
}
AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse& warehouse)
: fAffixTokenMatcherWarehouse(std::move(warehouse)) {}
AffixMatcherWarehouse& AffixMatcherWarehouse::operator=(AffixMatcherWarehouse&& src) = default;
AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
: fTokenWarehouse(tokenWarehouse) {
}
bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
@ -278,18 +298,14 @@ bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInf
return true;
}
AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
MutableMatcherCollection& output,
AffixTokenMatcherWarehouse tokenWarehouse,
const IgnorablesMatcher& ignorables,
parse_flags_t parseFlags,
UErrorCode& status) {
void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
MutableMatcherCollection& output,
const IgnorablesMatcher& ignorables,
parse_flags_t parseFlags, UErrorCode& status) {
if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
return {};
return;
}
AffixMatcherWarehouse warehouse(tokenWarehouse);
// The affixes have interesting characters, or we are in strict mode.
// Use initial capacity of 6, the highest possible number of AffixMatchers.
UnicodeString sb;
@ -309,21 +325,19 @@ AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatt
bool hasPrefix = false;
PatternStringUtils::patternInfoToStringBuilder(
patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb);
warehouse.fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
sb, tokenWarehouse, parseFlags, &hasPrefix, status);
AffixPatternMatcher* prefix = hasPrefix
? &warehouse.fAffixPatternMatchers[numAffixPatternMatchers++]
: nullptr;
fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
: nullptr;
// Generate Suffix
bool hasSuffix = false;
PatternStringUtils::patternInfoToStringBuilder(
patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb);
warehouse.fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
sb, tokenWarehouse, parseFlags, &hasSuffix, status);
AffixPatternMatcher* suffix = hasSuffix
? &warehouse.fAffixPatternMatchers[numAffixPatternMatchers++]
: nullptr;
fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
: nullptr;
if (signum == 1) {
posPrefix = prefix;
@ -338,14 +352,14 @@ AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatt
// Note: it is indeed possible for posPrefix and posSuffix to both be null.
// We still need to add that matcher for strict mode to work.
warehouse.fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
// The following if statements are designed to prevent adding two identical matchers.
if (signum == 1 || equals(prefix, posPrefix)) {
warehouse.fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
if (signum == 1 || !equals(prefix, posPrefix)) {
fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
}
if (signum == 1 || equals(suffix, posSuffix)) {
warehouse.fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
if (signum == 1 || !equals(suffix, posSuffix)) {
fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
}
}
}
@ -356,19 +370,20 @@ AffixMatcherWarehouse AffixMatcherWarehouse::createAffixMatchers(const AffixPatt
do {
madeChanges = false;
for (int32_t i = 1; i < numAffixMatchers; i++) {
if (warehouse.fAffixMatchers[i - 1].compareTo(warehouse.fAffixMatchers[i]) > 0) {
if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
madeChanges = true;
AffixMatcher temp = std::move(warehouse.fAffixMatchers[i - 1]);
warehouse.fAffixMatchers[i - 1] = std::move(warehouse.fAffixMatchers[i]);
warehouse.fAffixMatchers[i] = std::move(temp);
AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
fAffixMatchers[i] = std::move(temp);
}
}
} while (madeChanges);
for (int32_t i = 0; i < numAffixMatchers; i++) {
output.addMatcher(warehouse.fAffixMatchers[i]);
}
return warehouse;
for (int32_t i = 0; i < numAffixMatchers; i++) {
// Enable the following line to debug affixes
//std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
output.addMatcher(fAffixMatchers[i]);
}
}
@ -454,6 +469,14 @@ int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
}
}
UnicodeString AffixMatcher::toString() const {
    // Debug rendering: "<Affix PREFIX#SUFFIX>" or "<Affix:negative PREFIX#SUFFIX>",
    // where a missing prefix or suffix is written as "null".
    UnicodeString description(u"<Affix");
    if ((fFlags & FLAG_NEGATIVE) != 0) {
        description.append(u":negative ", -1);
    } else {
        description.append(u" ", -1);
    }

    if (fPrefix != nullptr) {
        description.append(fPrefix->getPattern());
    } else {
        description.append(u"null", -1);
    }

    description.append(u"#", -1);

    if (fSuffix != nullptr) {
        description.append(fSuffix->getPattern());
    } else {
        description.append(u"null", -1);
    }

    description.append(u">", -1);
    return description;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -12,6 +12,8 @@
#include "numparse_currency.h"
#include "number_affixutils.h"
#include <array>
U_NAMESPACE_BEGIN namespace numparse {
namespace impl {
@ -33,11 +35,57 @@ class CodePointMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
private:
UChar32 fCp;
};
/**
* A warehouse to retain ownership of CodePointMatchers.
*/
class CodePointMatcherWarehouse : public UMemory {
// Owns the CodePointMatcher objects it hands out. The first
// CODE_POINT_STACK_CAPACITY matchers live inside this object (codePoints);
// the remaining fields track heap "batches" held through codePointsOverflow.
// Only move operations are declared, so the class is move-only.
private:
static constexpr int32_t CODE_POINT_STACK_CAPACITY = 5; // Number of entries directly on the stack
static constexpr int32_t CODE_POINT_BATCH_SIZE = 10; // Number of entries per heap allocation
public:
// Creates an empty warehouse (both counters start at zero).
CodePointMatcherWarehouse();
// A custom destructor is needed to free the memory from MaybeStackArray.
// A custom move constructor and move assignment seem to be needed because of the custom destructor.
~CodePointMatcherWarehouse();
CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT;
CodePointMatcherWarehouse& operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT;
// Returns a reference to a matcher for cp that is stored in this warehouse.
// NOTE(review): the returned reference presumably stays valid only as long as
// this warehouse is neither moved nor destroyed -- confirm against callers.
NumberParseMatcher& nextCodePointMatcher(UChar32 cp);
private:
// NOTE: declaration order is also the construction order used by the constructors.
std::array<CodePointMatcher, CODE_POINT_STACK_CAPACITY> codePoints; // By value
MaybeStackArray<CodePointMatcher*, 3> codePointsOverflow; // On heap in "batches"
int32_t codePointCount; // Total for both the ones by value and on heap
int32_t codePointNumBatches; // Number of batches in codePointsOverflow
};
// Bundle of inputs that AffixTokenMatcherWarehouse reads when it builds its token matchers.
//
// Every member is a raw pointer or a reference: nothing is copied into this struct, so the
// objects it refers to must stay alive for as long as the struct is read through.
// The field order matters to callers that use aggregate initialization.
struct AffixTokenMatcherSetupData {
// Currency code handed to the currency matcher together with currency1 and currency2.
const UChar* currencyCode;
// First currency string handed to the currency matcher.
const UnicodeString& currency1;
// Second currency string handed to the currency matcher.
const UnicodeString& currency2;
// Symbols used to build the minus sign, plus sign, percent and permille matchers.
const DecimalFormatSymbols& dfs;
// Matcher returned as-is by AffixTokenMatcherWarehouse::ignorables().
IgnorablesMatcher& ignorables;
// Locale handed (with the error code) to the currency matcher.
const Locale& locale;
};
/**
* Small helper class that generates matchers for individual tokens for AffixPatternMatcher.
*
@ -48,21 +96,11 @@ class CodePointMatcher : public NumberParseMatcher, public UMemory {
*
* @author sffc
*/
class AffixTokenMatcherWarehouse {
private:
static constexpr int32_t CODE_POINT_STACK_CAPACITY = 5; // Number of entries directly on the stack
static constexpr int32_t CODE_POINT_BATCH_SIZE = 10; // Number of entries per heap allocation
class AffixTokenMatcherWarehouse : public UMemory {
public:
AffixTokenMatcherWarehouse() = default; // WARNING: Leaves the object in an unusable state
AffixTokenMatcherWarehouse(const UChar* currencyCode, const UnicodeString* currency1,
const UnicodeString* currency2, const DecimalFormatSymbols* dfs,
IgnorablesMatcher* ignorables, const Locale* locale);
AffixTokenMatcherWarehouse(AffixTokenMatcherWarehouse&& src) U_NOEXCEPT;
~AffixTokenMatcherWarehouse();
AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData);
NumberParseMatcher& minusSign();
@ -74,16 +112,13 @@ class AffixTokenMatcherWarehouse {
NumberParseMatcher& currency(UErrorCode& status);
IgnorablesMatcher& ignorables();
NumberParseMatcher& nextCodePointMatcher(UChar32 cp);
private:
// NOTE: The following fields may be unsafe to access after construction is done!
UChar currencyCode[4];
const UnicodeString* currency1;
const UnicodeString* currency2;
const DecimalFormatSymbols* dfs;
IgnorablesMatcher* ignorables;
const Locale* locale;
// NOTE: The following field may be unsafe to access after construction is done!
const AffixTokenMatcherSetupData* fSetupData;
// NOTE: These are default-constructed and should not be used until initialized.
MinusSignMatcher fMinusSign;
@ -92,10 +127,8 @@ class AffixTokenMatcherWarehouse {
PermilleMatcher fPermille;
CurrencyAnyMatcher fCurrency;
CodePointMatcher codePoints[CODE_POINT_STACK_CAPACITY]; // By value
MaybeStackArray<CodePointMatcher*, 3> codePointsOverflow; // On heap in "batches"
int32_t codePointCount; // Total for both the ones by value and on heap
int32_t codePointNumBatches; // Number of batches in codePointsOverflow
// Use a child class for code point matchers, since it requires non-default operators.
CodePointMatcherWarehouse fCodePoints;
friend class AffixPatternMatcherBuilder;
friend class AffixPatternMatcher;
@ -161,6 +194,8 @@ class AffixMatcher : public NumberParseMatcher, public UMemory {
int8_t compareTo(const AffixMatcher& rhs) const;
UnicodeString toString() const override;
private:
AffixPatternMatcher* fPrefix;
AffixPatternMatcher* fSuffix;
@ -175,23 +210,19 @@ class AffixMatcherWarehouse {
public:
AffixMatcherWarehouse() = default; // WARNING: Leaves the object in an unusable state
AffixMatcherWarehouse(AffixTokenMatcherWarehouse& warehouse);
AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse);
AffixMatcherWarehouse& operator=(AffixMatcherWarehouse&& src);
static AffixMatcherWarehouse createAffixMatchers(const AffixPatternProvider& patternInfo,
MutableMatcherCollection& output,
AffixTokenMatcherWarehouse tokenWarehouse,
const IgnorablesMatcher& ignorables,
parse_flags_t parseFlags, UErrorCode& status);
void createAffixMatchers(const AffixPatternProvider& patternInfo, MutableMatcherCollection& output,
const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
UErrorCode& status);
private:
// 9 is the limit: positive, zero, and negative, each with prefix, suffix, and prefix+suffix
AffixMatcher fAffixMatchers[9];
// 6 is the limit: positive, zero, and negative, a prefix and a suffix for each
AffixPatternMatcher fAffixPatternMatchers[6];
// Store all the tokens used by the AffixPatternMatchers
AffixTokenMatcherWarehouse fAffixTokenMatcherWarehouse;
// Reference to the warehouse for tokens used by the AffixPatternMatchers
AffixTokenMatcherWarehouse* fTokenWarehouse;
friend class AffixMatcher;

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_compositions.h"
#include "unicode/uniset.h"
@ -113,5 +117,9 @@ const NumberParseMatcher* const* ArraySeriesMatcher::end() const {
return fMatchers.getAlias() + fMatchersLen;
}
UnicodeString ArraySeriesMatcher::toString() const {
    // Fixed debug label; the contained matchers are not listed.
    UnicodeString label(u"<ArraySeries>");
    return label;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -87,6 +87,8 @@ class ArraySeriesMatcher : public SeriesMatcher {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
int32_t length() const override;
protected:

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_currency.h"
#include "ucurrimp.h"
@ -66,6 +70,10 @@ const UnicodeSet& CurrencyNamesMatcher::getLeadCodePoints() {
return *fLocalLeadCodePoints;
}
UnicodeString CurrencyNamesMatcher::toString() const {
    // Fixed debug label for this matcher type.
    UnicodeString label(u"<CurrencyNames>");
    return label;
}
CurrencyCustomMatcher::CurrencyCustomMatcher(const char16_t* currencyCode, const UnicodeString& currency1,
const UnicodeString& currency2)
@ -106,6 +114,10 @@ const UnicodeSet& CurrencyCustomMatcher::getLeadCodePoints() {
return *fLocalLeadCodePoints;
}
UnicodeString CurrencyCustomMatcher::toString() const {
    // Fixed debug label; the configured currency strings are not included.
    UnicodeString label(u"<CurrencyCustom>");
    return label;
}
CurrencyAnyMatcher::CurrencyAnyMatcher() {
fMatcherArray[0] = &fNamesMatcher;
@ -151,5 +163,9 @@ const NumberParseMatcher* const* CurrencyAnyMatcher::end() const {
return fMatcherArray + 2;
}
UnicodeString CurrencyAnyMatcher::toString() const {
    // Fixed debug label for this matcher type.
    UnicodeString label(u"<CurrencyAny>");
    return label;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -32,6 +32,8 @@ class CurrencyNamesMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
private:
// We could use Locale instead of CharString here, but
// Locale has a non-trivial default constructor.
@ -51,6 +53,8 @@ class CurrencyCustomMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
private:
UChar fCurrencyCode[4];
UnicodeString fCurrency1;
@ -75,6 +79,8 @@ class CurrencyAnyMatcher : public AnyMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
protected:
const NumberParseMatcher* const* begin() const override;

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_decimal.h"
#include "numparse_unisets.h"
@ -312,5 +316,9 @@ const UnicodeSet& DecimalMatcher::getLeadCodePoints() {
return *fLocalLeadCodePoints;
}
UnicodeString DecimalMatcher::toString() const {
    // Fixed debug label for this matcher type.
    UnicodeString label(u"<Decimal>");
    return label;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -29,6 +29,8 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
private:
/** If true, only accept strings whose grouping sizes match the locale */
bool requireGroupingMatch;

View file

@ -5,7 +5,8 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "number_types.h"
@ -17,6 +18,9 @@
#include "unicode/numberformatter.h"
#include <typeinfo>
#include <array>
#include <iostream>
#include "cstr.h"
using namespace icu;
using namespace icu::number;
@ -35,24 +39,20 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES};
IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
const UChar currencyCode[] = u"USD";
UnicodeString currency1(u"IU$");
UnicodeString currency2(u"ICU");
ParsedPatternInfo patternInfo;
PatternParser::parseToPatternInfo(patternString, patternInfo, status);
// The following statement sets up the affix matchers.
// AffixMatcherWarehouse warehouse = ;
parser->fLocalMatchers.affixMatcherWarehouse = std::move(AffixMatcherWarehouse::createAffixMatchers(
patternInfo,
*parser,
AffixTokenMatcherWarehouse(
u"USD", &currency1, &currency2, &symbols, &ignorables, &locale),
ignorables,
parseFlags,
status));
// The following statements set up the affix matchers.
AffixTokenMatcherSetupData affixSetupData = {
currencyCode, currency1, currency2, symbols, ignorables, locale};
parser->fLocalMatchers.affixTokenMatcherWarehouse = {&affixSetupData};
parser->fLocalMatchers.affixMatcherWarehouse = {&parser->fLocalMatchers.affixTokenMatcherWarehouse};
parser->fLocalMatchers.affixMatcherWarehouse.createAffixMatchers(
patternInfo, *parser, ignorables, parseFlags, status);
Grouper grouper = Grouper::forStrategy(UNUM_GROUPING_AUTO);
grouper.setLocaleData(patternInfo, locale);
@ -233,7 +233,7 @@ UnicodeString NumberParserImpl::toString() const {
UnicodeString result(u"<NumberParserImpl matchers:[");
for (int32_t i = 0; i < fNumMatchers; i++) {
result.append(u' ');
result.append(UnicodeString(typeid(*fMatchers[i]).name()));
result.append(fMatchers[i]->toString());
}
result.append(u" ]>", -1);
return result;

View file

@ -60,6 +60,7 @@ class NumberParserImpl : public MutableMatcherCollection {
ScientificMatcher scientific;
CurrencyNamesMatcher currencyNames;
AffixMatcherWarehouse affixMatcherWarehouse;
AffixTokenMatcherWarehouse affixTokenMatcherWarehouse;
} fLocalMatchers;
NumberParserImpl(parse_flags_t parseFlags, bool computeLeads);

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include <cmath>
@ -67,7 +71,11 @@ double ParsedNumber::getDouble() const {
}
// TODO: MIN_LONG
return quantity.toDouble();
double d = quantity.toDouble();
if (0 != (flags & FLAG_NEGATIVE)) {
d *= -1;
}
return d;
}
bool ParsedNumber::isBetterThan(const ParsedNumber& other) {

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_scientific.h"
#include "numparse_unisets.h"
@ -83,5 +87,9 @@ const UnicodeSet& ScientificMatcher::getLeadCodePoints() {
return *fLocalLeadCodePoints;
}
UnicodeString ScientificMatcher::toString() const {
    // Fixed debug label for this matcher type.
    UnicodeString label(u"<Scientific>");
    return label;
}
#endif /* #if !UCONFIG_NO_FORMATTING */

View file

@ -27,6 +27,8 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
private:
UnicodeString fExponentSeparatorString;
DecimalMatcher fExponentMatcher;

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_stringsegment.h"
#include "putilimp.h"

View file

@ -5,6 +5,10 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_types.h"
#include "numparse_symbols.h"
#include "numparse_utils.h"
@ -70,6 +74,11 @@ const UnicodeSet& SymbolMatcher::getLeadCodePoints() {
return *fLocalLeadCodePoints;
}
UnicodeString SymbolMatcher::toString() const {
    // Generic debug label shared by the symbol matchers.
    // TODO: Customize output for each symbol
    UnicodeString label(u"<Symbol>");
    return label;
}
IgnorablesMatcher::IgnorablesMatcher(unisets::Key key)
: SymbolMatcher({}, key) {
@ -79,6 +88,10 @@ bool IgnorablesMatcher::isFlexible() const {
return true;
}
UnicodeString IgnorablesMatcher::toString() const {
    // Fixed debug label for this matcher type.
    UnicodeString label(u"<Ignorables>");
    return label;
}
bool IgnorablesMatcher::isDisabled(const ParsedNumber&) const {
// Always reports "not disabled"; the ParsedNumber argument is not inspected.
return false;
}

View file

@ -30,6 +30,8 @@ class SymbolMatcher : public NumberParseMatcher, public UMemory {
const UnicodeSet& getLeadCodePoints() override;
UnicodeString toString() const override;
virtual bool isDisabled(const ParsedNumber& result) const = 0;
virtual void accept(StringSegment& segment, ParsedNumber& result) const = 0;
@ -50,6 +52,8 @@ class IgnorablesMatcher : public SymbolMatcher {
bool isFlexible() const override;
UnicodeString toString() const override;
protected:
bool isDisabled(const ParsedNumber& result) const override;

View file

@ -318,6 +318,9 @@ class NumberParseMatcher {
// Default implementation: no-op
};
// String for debugging
virtual UnicodeString toString() const = 0;
protected:
// No construction except by subclasses!
NumberParseMatcher() = default;

View file

@ -5,8 +5,8 @@
#if !UCONFIG_NO_FORMATTING && !UPRV_INCOMPLETE_CPP11_SUPPORT
// Allow implicit conversion from char16_t* to UnicodeString for this file
// (useful for UnicodeSet constructor)
// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT
#include "numparse_unisets.h"

View file

@ -69,33 +69,33 @@ void NumberParserTest::testBasic() {
{3, u"-∞", u"0", 2, -INFINITY},
{3, u"@@@123 @@", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak?
{3, u"@@@123@@ ", u"0", 6, 123.}, // TODO: Should padding be strong instead of weak?
// {3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.},
// {3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.},
{3, u"a51423US dollars", u"a0¤¤¤", 16, 51423.},
{3, u"a 51423 US dollars", u"a0¤¤¤", 18, 51423.},
{3, u"514.23 USD", u"¤0", 10, 514.23},
{3, u"514.23 GBP", u"¤0", 10, 514.23},
// {3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.},
// {3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
// {3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
{3, u"a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 14, 51423.},
{3, u"-a 𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
{3, u"a -𝟱𝟭𝟰𝟮𝟯 b", u"a0b", 15, -51423.},
{3, u"𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 10, 51423.},
{3, u"[𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, 51423.},
{3, u"𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 11, 51423.},
{3, u"[𝟱𝟭𝟰𝟮𝟯]", u"[0];(0)", 12, 51423.},
// {3, u"(𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, -51423.},
// {3, u"𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 11, -51423.},
// {3, u"(𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 12, -51423.},
// {3, u"𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 10, 51423.},
// {3, u"{𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 11, 51423.},
// {3, u"𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 11, 51423.},
// {3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.},
// {1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
// {2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
{3, u"(𝟱𝟭𝟰𝟮𝟯", u"[0];(0)", 11, -51423.},
{3, u"𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 11, -51423.},
{3, u"(𝟱𝟭𝟰𝟮𝟯)", u"[0];(0)", 12, -51423.},
{3, u"𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 10, 51423.},
{3, u"{𝟱𝟭𝟰𝟮𝟯", u"{0};{0}", 11, 51423.},
{3, u"𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 11, 51423.},
{3, u"{𝟱𝟭𝟰𝟮𝟯}", u"{0};{0}", 12, 51423.},
{1, u"a40b", u"a0'0b'", 3, 40.}, // greedy code path thinks "40" is the number
{2, u"a40b", u"a0'0b'", 4, 4.}, // slow code path finds the suffix "0b"
{3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
{3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
{3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
{7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
// {3, u"a$ b5", u"a ¤ b0", 5, 5.0},
// {3, u"📺1.23", u"📺0;📻0", 6, 1.23},
// {3, u"📻1.23", u"📺0;📻0", 6, -1.23},
{3, u"a$ b5", u"a ¤ b0", 5, 5.0},
{3, u"📺1.23", u"📺0;📻0", 6, 1.23},
{3, u"📻1.23", u"📺0;📻0", 6, -1.23},
{3, u".00", u"0", 3, 0.0},
{3, u" 1,234", u"a0", 35, 1234.}, // should not hang
{3, u"NaN", u"0", 3, NAN},
@ -215,27 +215,29 @@ void NumberParserTest::testSeriesMatcher() {
void NumberParserTest::testCurrencyAnyMatcher() {
IcuTestErrorCode status(*this, "testCurrencyAnyMatcher");
UnicodeString currency1(u"IU$");
UnicodeString currency2(u"ICU");
DecimalFormatSymbols symbols("en", status);
IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES);
Locale locale("en");
AffixTokenMatcherWarehouse warehouse(u"ICU", &currency1, &currency2, &symbols, &ignorables, &locale);
AffixTokenMatcherSetupData affixSetupData = {
u"ICU",
u"IU$",
u"ICU",
{"en", status},
ignorables,
"en"};
AffixTokenMatcherWarehouse warehouse(&affixSetupData);
NumberParseMatcher& matcher = warehouse.currency(status);
static const struct TestCase{
static const struct TestCase {
const char16_t* input;
const char16_t* expectedCurrencyCode;
} cases[] {
{ u"", u"\x00" },
{ u"FOO", u"\x00" },
{ u"USD", u"USD" },
{ u"$", u"USD" },
{ u"US dollars", u"USD" },
{ u"eu", u"\x00" },
{ u"euros", u"EUR" },
{ u"ICU", u"ICU" },
{ u"IU$", u"ICU" } };
} cases[]{{u"", u"\x00"},
{u"FOO", u"\x00"},
{u"USD", u"USD"},
{u"$", u"USD"},
{u"US dollars", u"USD"},
{u"eu", u"\x00"},
{u"euros", u"EUR"},
{u"ICU", u"ICU"},
{u"IU$", u"ICU"}};
for (auto& cas : cases) {
UnicodeString input(cas.input);
@ -243,7 +245,8 @@ void NumberParserTest::testCurrencyAnyMatcher() {
ParsedNumber result;
matcher.match(segment, result, status);
assertEquals("Parsing " + input, cas.expectedCurrencyCode, result.currencyCode);
assertEquals("Whole string on " + input,
assertEquals(
"Whole string on " + input,
cas.expectedCurrencyCode[0] == 0 ? 0 : input.length(),
result.charEnd);
}
@ -251,13 +254,15 @@ void NumberParserTest::testCurrencyAnyMatcher() {
void NumberParserTest::testAffixPatternMatcher() {
IcuTestErrorCode status(*this, "testAffixPatternMatcher");
UnicodeString currency1(u"foo");
UnicodeString currency2(u"bar");
DecimalFormatSymbols symbols("en", status);
IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES);
Locale locale("en");
AffixTokenMatcherWarehouse warehouse(u"EUR", &currency1, &currency2, &symbols, &ignorables, &locale);
AffixTokenMatcherSetupData affixSetupData = {
u"USD",
u"foo",
u"bar",
{"en", status},
ignorables,
"en"};
AffixTokenMatcherWarehouse warehouse(&affixSetupData);
static const struct TestCase {
bool exactMatch;
@ -269,8 +274,7 @@ void NumberParserTest::testAffixPatternMatcher() {
{true, u"+-%", 3, u"+-%"},
{false, u"ab c", 5, u"a bc"},
{true, u"abc", 3, u"abc"},
{false, u"hello-to+this%very¤long‰string", 59, u"hello-to+this%very USD long‰string"}
};
{false, u"hello-to+this%very¤long‰string", 59, u"hello-to+this%very USD long‰string"}};
for (auto& cas : cases) {
UnicodeString affixPattern(cas.affixPattern);

View file

@ -9,6 +9,8 @@ import com.ibm.icu.text.UnicodeSet;
* A mutable class allowing for a String with a variable offset and length. The charAt, length, and
* subSequence methods all operate relative to the fixed offset into the String.
*
* TODO: Make sure that this operates only on code point boundaries.
*
* @author sffc
*/
public class StringSegment implements CharSequence {