ICU-20616 Allow bidi marks around the sign in exponent parsing.

This commit is contained in:
Shane Carr 2019-05-21 15:36:25 -07:00 committed by Shane F. Carr
parent afa9b9b48e
commit c8c3fbca28
14 changed files with 140 additions and 61 deletions

View file

@ -39,7 +39,7 @@ NumberParserImpl::createSimpleParser(const Locale& locale, const UnicodeString&
LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
DecimalFormatSymbols symbols(locale, status);
parser->fLocalMatchers.ignorables = {unisets::DEFAULT_IGNORABLES};
parser->fLocalMatchers.ignorables = {parseFlags};
IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
DecimalFormatSymbols dfs(locale, status);
@ -114,6 +114,7 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr
parseFlags |= PARSE_FLAG_STRICT_SEPARATORS;
parseFlags |= PARSE_FLAG_USE_FULL_AFFIXES;
parseFlags |= PARSE_FLAG_EXACT_AFFIX;
parseFlags |= PARSE_FLAG_STRICT_IGNORABLES;
} else {
parseFlags |= PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
}
@ -129,8 +130,7 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr
LocalPointer<NumberParserImpl> parser(new NumberParserImpl(parseFlags));
parser->fLocalMatchers.ignorables = {
isStrict ? unisets::STRICT_IGNORABLES : unisets::DEFAULT_IGNORABLES};
parser->fLocalMatchers.ignorables = {parseFlags};
IgnorablesMatcher& ignorables = parser->fLocalMatchers.ignorables;
//////////////////////

View file

@ -34,7 +34,8 @@ inline const UnicodeSet& plusSignSet() {
ScientificMatcher::ScientificMatcher(const DecimalFormatSymbols& dfs, const Grouper& grouper)
: fExponentSeparatorString(dfs.getConstSymbol(DecimalFormatSymbols::kExponentialSymbol)),
fExponentMatcher(dfs, grouper, PARSE_FLAG_INTEGER_ONLY | PARSE_FLAG_GROUPING_DISABLED) {
fExponentMatcher(dfs, grouper, PARSE_FLAG_INTEGER_ONLY | PARSE_FLAG_GROUPING_DISABLED),
fIgnorablesMatcher(PARSE_FLAG_STRICT_IGNORABLES) {
const UnicodeString& minusSign = dfs.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
if (minusSignSet().contains(minusSign)) {
@ -64,15 +65,25 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
// First match the scientific separator, and then match another number after it.
// NOTE: This is guarded by the smoke test; no need to check fExponentSeparatorString length again.
int overlap1 = segment.getCommonPrefixLength(fExponentSeparatorString);
if (overlap1 == fExponentSeparatorString.length()) {
int32_t initialOffset = segment.getOffset();
int32_t overlap = segment.getCommonPrefixLength(fExponentSeparatorString);
if (overlap == fExponentSeparatorString.length()) {
// Full exponent separator match.
// First attempt to get a code point, returning true if we can't get one.
if (segment.length() == overlap1) {
if (segment.length() == overlap) {
return true;
}
segment.adjustOffset(overlap);
// Allow ignorables before the sign.
// Note: call site is guarded by the segment.length() check above.
// Note: the ignorables matcher should not touch the result.
fIgnorablesMatcher.match(segment, result, status);
if (segment.length() == 0) {
segment.setOffset(initialOffset);
return true;
}
segment.adjustOffset(overlap1);
// Allow a sign, and then try to match digits.
int8_t exponentSign = 1;
@ -82,24 +93,37 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
} else if (segment.startsWith(plusSignSet())) {
segment.adjustOffsetByCodePoint();
} else if (segment.startsWith(fCustomMinusSign)) {
// Note: call site is guarded with startsWith, which returns false on empty string
int32_t overlap2 = segment.getCommonPrefixLength(fCustomMinusSign);
if (overlap2 != fCustomMinusSign.length()) {
// Partial custom sign match; un-match the exponent separator.
segment.adjustOffset(-overlap1);
overlap = segment.getCommonPrefixLength(fCustomMinusSign);
if (overlap != fCustomMinusSign.length()) {
// Partial custom sign match
segment.setOffset(initialOffset);
return true;
}
exponentSign = -1;
segment.adjustOffset(overlap2);
segment.adjustOffset(overlap);
} else if (segment.startsWith(fCustomPlusSign)) {
// Note: call site is guarded with startsWith, which returns false on empty string
int32_t overlap2 = segment.getCommonPrefixLength(fCustomPlusSign);
if (overlap2 != fCustomPlusSign.length()) {
// Partial custom sign match; un-match the exponent separator.
segment.adjustOffset(-overlap1);
overlap = segment.getCommonPrefixLength(fCustomPlusSign);
if (overlap != fCustomPlusSign.length()) {
// Partial custom sign match
segment.setOffset(initialOffset);
return true;
}
segment.adjustOffset(overlap2);
segment.adjustOffset(overlap);
}
// Return true if the segment is empty.
if (segment.length() == 0) {
segment.setOffset(initialOffset);
return true;
}
// Allow ignorables after the sign.
// Note: call site is guarded by the segment.length() check above.
// Note: the ignorables matcher should not touch the result.
fIgnorablesMatcher.match(segment, result, status);
if (segment.length() == 0) {
segment.setOffset(initialOffset);
return true;
}
// We are supposed to accept E0 after NaN, so we need to make sure result.quantity is available.
@ -113,12 +137,12 @@ bool ScientificMatcher::match(StringSegment& segment, ParsedNumber& result, UErr
// At least one exponent digit was matched.
result.flags |= FLAG_HAS_EXPONENT;
} else {
// No exponent digits were matched; un-match the exponent separator.
segment.adjustOffset(-overlap1);
// No exponent digits were matched
segment.setOffset(initialOffset);
}
return digitsReturnValue;
} else if (overlap1 == segment.length()) {
} else if (overlap == segment.length()) {
// Partial exponent separator match
return true;
}

View file

@ -9,6 +9,7 @@
#include "numparse_types.h"
#include "numparse_decimal.h"
#include "numparse_symbols.h"
#include "unicode/numberformatter.h"
using icu::number::impl::Grouper;
@ -32,6 +33,7 @@ class ScientificMatcher : public NumberParseMatcher, public UMemory {
private:
UnicodeString fExponentSeparatorString;
DecimalMatcher fExponentMatcher;
IgnorablesMatcher fIgnorablesMatcher;
UnicodeString fCustomMinusSign;
UnicodeString fCustomPlusSign;
};

View file

@ -69,8 +69,12 @@ UnicodeString SymbolMatcher::toString() const {
}
IgnorablesMatcher::IgnorablesMatcher(unisets::Key key)
: SymbolMatcher({}, key) {
IgnorablesMatcher::IgnorablesMatcher(parse_flags_t parseFlags) :
SymbolMatcher(
{},
(0 != (parseFlags & PARSE_FLAG_STRICT_IGNORABLES)) ?
unisets::STRICT_IGNORABLES :
unisets::DEFAULT_IGNORABLES) {
}
bool IgnorablesMatcher::isFlexible() const {

View file

@ -50,7 +50,7 @@ class U_I18N_API IgnorablesMatcher : public SymbolMatcher {
public:
IgnorablesMatcher() = default; // WARNING: Leaves the object in an unusable state
IgnorablesMatcher(unisets::Key key);
IgnorablesMatcher(parse_flags_t parseFlags);
bool isFlexible() const override;

View file

@ -51,6 +51,7 @@ enum ParseFlags {
// PARSE_FLAG_FORCE_BIG_DECIMAL = 0x1000, // not used in ICU4C
PARSE_FLAG_NO_FOREIGN_CURRENCY = 0x2000,
PARSE_FLAG_ALLOW_INFINITE_RECURSION = 0x4000,
PARSE_FLAG_STRICT_IGNORABLES = 0x8000,
};

View file

@ -111,6 +111,10 @@ void NumberParserTest::testBasic() {
{3, u"𝟱.𝟭𝟰𝟮E𝟯", u"0", 12, 5142.},
{3, u"𝟱.𝟭𝟰𝟮E-𝟯", u"0", 13, 0.005142},
{3, u"𝟱.𝟭𝟰𝟮e-𝟯", u"0", 13, 0.005142},
{3, u"5.142e+3", u"0", 8, 5142.0 },
{3, u"5.142\u200Ee+3", u"0", 9, 5142.0},
{3, u"5.142e\u200E+3", u"0", 9, 5142.0},
{3, u"5.142e+\u200E3", u"0", 9, 5142.0},
{7, u"5,142.50 Canadian dollars", u"#,##,##0 ¤¤¤", 25, 5142.5},
{3, u"a$ b5", u"a ¤ b0", 5, 5.0},
{3, u"📺1.23", u"📺0;📻0", 6, 1.23},
@ -188,9 +192,9 @@ void NumberParserTest::testSeriesMatcher() {
}
PlusSignMatcher m0(symbols, false);
MinusSignMatcher m1(symbols, false);
IgnorablesMatcher m2(unisets::DEFAULT_IGNORABLES);
IgnorablesMatcher m2(0);
PercentMatcher m3(symbols);
IgnorablesMatcher m4(unisets::DEFAULT_IGNORABLES);
IgnorablesMatcher m4(0);
ArraySeriesMatcher::MatcherArray matchers(5);
matchers[0] = &m0;
@ -238,7 +242,7 @@ void NumberParserTest::testSeriesMatcher() {
void NumberParserTest::testCombinedCurrencyMatcher() {
IcuTestErrorCode status(*this, "testCombinedCurrencyMatcher");
IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES);
IgnorablesMatcher ignorables(0);
Locale locale = Locale::getEnglish();
DecimalFormatSymbols dfs(locale, status);
@ -305,7 +309,7 @@ void NumberParserTest::testCombinedCurrencyMatcher() {
void NumberParserTest::testAffixPatternMatcher() {
IcuTestErrorCode status(*this, "testAffixPatternMatcher");
Locale locale = Locale::getEnglish();
IgnorablesMatcher ignorables(unisets::DEFAULT_IGNORABLES);
IgnorablesMatcher ignorables(0);
DecimalFormatSymbols dfs(locale, status);
dfs.setSymbol(DecimalFormatSymbols::kCurrencySymbol, u"IU$", status);

View file

@ -1749,7 +1749,13 @@ gbp123 123 GBP C
British pounds 123 123 GBP H
british POUNDS 123 123 GBP H
test parse scientific with bidi marks
begin
locale parse output breaks
en 4E\u200E+02 400 HK
en 4E+02 400 K
he 4E\u200E+02 400 K
he 4E+02 400 HK

View file

@ -12,15 +12,18 @@ import com.ibm.icu.text.UnicodeSet;
*/
public class IgnorablesMatcher extends SymbolMatcher implements NumberParseMatcher.Flexible {
public static final IgnorablesMatcher DEFAULT = new IgnorablesMatcher(
private static final IgnorablesMatcher DEFAULT = new IgnorablesMatcher(
StaticUnicodeSets.get(StaticUnicodeSets.Key.DEFAULT_IGNORABLES));
public static final IgnorablesMatcher STRICT = new IgnorablesMatcher(
private static final IgnorablesMatcher STRICT = new IgnorablesMatcher(
StaticUnicodeSets.get(StaticUnicodeSets.Key.STRICT_IGNORABLES));
public static IgnorablesMatcher getInstance(UnicodeSet ignorables) {
assert ignorables.isFrozen();
return new IgnorablesMatcher(ignorables);
public static IgnorablesMatcher getInstance(int parseFlags) {
if (0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_IGNORABLES)) {
return STRICT;
} else {
return DEFAULT;
}
}
private IgnorablesMatcher(UnicodeSet ignorables) {

View file

@ -42,7 +42,7 @@ public class NumberParserImpl {
NumberParserImpl parser = new NumberParserImpl(parseFlags);
Currency currency = Currency.getInstance("USD");
DecimalFormatSymbols symbols = DecimalFormatSymbols.getInstance(locale);
IgnorablesMatcher ignorables = IgnorablesMatcher.DEFAULT;
IgnorablesMatcher ignorables = IgnorablesMatcher.getInstance(parseFlags);
AffixTokenMatcherFactory factory = new AffixTokenMatcherFactory();
factory.currency = currency;
@ -165,6 +165,7 @@ public class NumberParserImpl {
parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_SEPARATORS;
parseFlags |= ParsingUtils.PARSE_FLAG_USE_FULL_AFFIXES;
parseFlags |= ParsingUtils.PARSE_FLAG_EXACT_AFFIX;
parseFlags |= ParsingUtils.PARSE_FLAG_STRICT_IGNORABLES;
} else {
parseFlags |= ParsingUtils.PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES;
}
@ -177,9 +178,9 @@ public class NumberParserImpl {
if (!parseCurrency) {
parseFlags |= ParsingUtils.PARSE_FLAG_NO_FOREIGN_CURRENCIES;
}
IgnorablesMatcher ignorables = isStrict ? IgnorablesMatcher.STRICT : IgnorablesMatcher.DEFAULT;
NumberParserImpl parser = new NumberParserImpl(parseFlags);
IgnorablesMatcher ignorables = IgnorablesMatcher.getInstance(parseFlags);
AffixTokenMatcherFactory factory = new AffixTokenMatcherFactory();
factory.currency = currency;

View file

@ -25,6 +25,7 @@ public class ParsingUtils {
public static final int PARSE_FLAG_FORCE_BIG_DECIMAL = 0x1000;
public static final int PARSE_FLAG_NO_FOREIGN_CURRENCIES = 0x2000;
public static final int PARSE_FLAG_ALLOW_INFINITE_RECURSION = 0x4000;
public static final int PARSE_FLAG_STRICT_IGNORABLES = 0x8000;
public static void putLeadCodePoints(UnicodeSet input, UnicodeSet output) {
for (EntryRange range : input.ranges()) {

View file

@ -19,6 +19,7 @@ public class ScientificMatcher implements NumberParseMatcher {
private final String exponentSeparatorString;
private final DecimalMatcher exponentMatcher;
private final IgnorablesMatcher ignorablesMatcher;
private final String customMinusSign;
private final String customPlusSign;
@ -32,6 +33,7 @@ public class ScientificMatcher implements NumberParseMatcher {
exponentMatcher = DecimalMatcher.getInstance(symbols,
grouper,
ParsingUtils.PARSE_FLAG_INTEGER_ONLY | ParsingUtils.PARSE_FLAG_GROUPING_DISABLED);
ignorablesMatcher = IgnorablesMatcher.getInstance(ParsingUtils.PARSE_FLAG_STRICT_IGNORABLES);
String minusSign = symbols.getMinusSignString();
customMinusSign = safeContains(minusSignSet(), minusSign) ? null : minusSign;
@ -61,15 +63,24 @@ public class ScientificMatcher implements NumberParseMatcher {
// First match the scientific separator, and then match another number after it.
// NOTE: This is guarded by the smoke test; no need to check exponentSeparatorString length again.
int overlap1 = segment.getCommonPrefixLength(exponentSeparatorString);
if (overlap1 == exponentSeparatorString.length()) {
int initialOffset = segment.getOffset();
int overlap = segment.getCommonPrefixLength(exponentSeparatorString);
if (overlap == exponentSeparatorString.length()) {
// Full exponent separator match.
// First attempt to get a code point, returning true if we can't get one.
if (segment.length() == overlap1) {
if (segment.length() == overlap) {
return true;
}
segment.adjustOffset(overlap);
// Allow ignorables before the sign.
// Note: call site is guarded by the segment.length() check above.
ignorablesMatcher.match(segment, null);
if (segment.length() == 0) {
segment.setOffset(initialOffset);
return true;
}
segment.adjustOffset(overlap1);
// Allow a sign, and then try to match digits.
int exponentSign = 1;
@ -79,24 +90,36 @@ public class ScientificMatcher implements NumberParseMatcher {
} else if (segment.startsWith(plusSignSet())) {
segment.adjustOffsetByCodePoint();
} else if (segment.startsWith(customMinusSign)) {
// Note: call site is guarded with startsWith, which returns false on empty string
int overlap2 = segment.getCommonPrefixLength(customMinusSign);
if (overlap2 != customMinusSign.length()) {
// Partial custom sign match; un-match the exponent separator.
segment.adjustOffset(-overlap1);
overlap = segment.getCommonPrefixLength(customMinusSign);
if (overlap != customMinusSign.length()) {
// Partial custom sign match
segment.setOffset(initialOffset);
return true;
}
exponentSign = -1;
segment.adjustOffset(overlap2);
segment.adjustOffset(overlap);
} else if (segment.startsWith(customPlusSign)) {
// Note: call site is guarded with startsWith, which returns false on empty string
int overlap2 = segment.getCommonPrefixLength(customPlusSign);
if (overlap2 != customPlusSign.length()) {
// Partial custom sign match; un-match the exponent separator.
segment.adjustOffset(-overlap1);
overlap = segment.getCommonPrefixLength(customPlusSign);
if (overlap != customPlusSign.length()) {
// Partial custom sign match
segment.setOffset(initialOffset);
return true;
}
segment.adjustOffset(overlap2);
segment.adjustOffset(overlap);
}
// Return true if the segment is empty.
if (segment.length() == 0) {
segment.setOffset(initialOffset);
return true;
}
// Allow ignorables after the sign.
// Note: call site is guarded by the segment.length() check above.
ignorablesMatcher.match(segment, null);
if (segment.length() == 0) {
segment.setOffset(initialOffset);
return true;
}
// We are supposed to accept E0 after NaN, so we need to make sure result.quantity is available.
@ -114,12 +137,12 @@ public class ScientificMatcher implements NumberParseMatcher {
// At least one exponent digit was matched.
result.flags |= ParsedNumber.FLAG_HAS_EXPONENT;
} else {
// No exponent digits were matched; un-match the exponent separator.
segment.adjustOffset(-overlap1);
// No exponent digits were matched
segment.setOffset(initialOffset);
}
return digitsReturnValue;
} else if (overlap1 == segment.length()) {
} else if (overlap == segment.length()) {
// Partial exponent separator match
return true;
}

View file

@ -1749,7 +1749,13 @@ gbp123 123 GBP C
British pounds 123 123 GBP H
british POUNDS 123 123 GBP H
test parse scientific with bidi marks
begin
locale parse output breaks
en 4E\u200E+02 400 HK
en 4E+02 400 K
he 4E\u200E+02 400 K
he 4E+02 400 HK

View file

@ -118,6 +118,10 @@ public class NumberParserTest {
{ 3, "𝟱.𝟭𝟰𝟮E𝟯", "0", 12, 5142. },
{ 3, "𝟱.𝟭𝟰𝟮E-𝟯", "0", 13, 0.005142 },
{ 3, "𝟱.𝟭𝟰𝟮e-𝟯", "0", 13, 0.005142 },
{ 3, "5.142e+3", "0", 8, 5142.0 },
{ 3, "5.142\u200Ee+3", "0", 9, 5142.0 },
{ 3, "5.142e\u200E+3", "0", 9, 5142.0 },
{ 3, "5.142e+\u200E3", "0", 9, 5142.0 },
{ 7, "5,142.50 Canadian dollars", "#,##,##0 ¤¤¤", 25, 5142.5 },
{ 3, "a$ b5", "a ¤ b0", 5, 5.0 },
{ 3, "📺1.23", "📺0;📻0", 6, 1.23 },
@ -210,9 +214,9 @@ public class NumberParserTest {
SeriesMatcher series = new SeriesMatcher();
series.addMatcher(PlusSignMatcher.getInstance(symbols, false));
series.addMatcher(MinusSignMatcher.getInstance(symbols, false));
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.addMatcher(IgnorablesMatcher.getInstance(0));
series.addMatcher(PercentMatcher.getInstance(symbols));
series.addMatcher(IgnorablesMatcher.DEFAULT);
series.addMatcher(IgnorablesMatcher.getInstance(0));
series.freeze();
assertFalse(series.smokeTest(new StringSegment("x", false)));
@ -305,7 +309,7 @@ public class NumberParserTest {
AffixTokenMatcherFactory factory = new AffixTokenMatcherFactory();
factory.currency = Currency.getInstance("EUR");
factory.symbols = DecimalFormatSymbols.getInstance(ULocale.ENGLISH);
factory.ignorables = IgnorablesMatcher.DEFAULT;
factory.ignorables = IgnorablesMatcher.getInstance(0);
factory.locale = ULocale.ENGLISH;
factory.parseFlags = 0;