ICU-13783 Re-writing grouping parsing logic. Same behavior but passes more corner cases.

X-SVN-Rev: 41427
This commit is contained in:
Shane Carr 2018-05-22 02:46:49 +00:00
parent ebca759ea1
commit 33a0fa7172
13 changed files with 610 additions and 365 deletions

View file

@ -228,6 +228,9 @@ bool DecimalQuantity::adjustMagnitude(int32_t delta) {
// i.e., scale += delta; origDelta += delta
bool overflow = uprv_add32_overflow(scale, delta, &scale);
overflow = uprv_add32_overflow(origDelta, delta, &origDelta) || overflow;
// Make sure that precision + scale won't overflow, either
int32_t dummy;
overflow = overflow || uprv_add32_overflow(scale, precision, &dummy);
return overflow;
}
return false;

View file

@ -15,6 +15,7 @@
#include "numparse_utils.h"
#include "unicode/uchar.h"
#include "putilimp.h"
#include "number_decimalquantity.h"
using namespace icu;
using namespace icu::numparse;
@ -79,11 +80,13 @@ DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Groupe
requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
fractionGroupingDisabled = 0 != (
parseFlags & PARSE_FLAG_FRACTION_GROUPING_DISABLED);
integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
grouping1 = grouper.getPrimary();
grouping2 = grouper.getSecondary();
// Fraction grouping parsing is disabled for now but could be enabled later.
// See http://bugs.icu-project.org/trac/ticket/10794
// fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
}
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
@ -100,30 +103,55 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
U_ASSERT(!result.quantity.bogus);
}
ParsedNumber backupResult(result);
// strict parsing
bool strictFail = false; // did we exit with a strict parse failure?
UnicodeString actualGroupingString = groupingSeparator;
UnicodeString actualDecimalString = decimalSeparator;
int32_t groupedDigitCount = 0; // tracking count of digits delimited by grouping separator
int32_t backupOffset = -1; // used for preserving the last confirmed position
int32_t smallGroupBackupOffset = -1; // used to back up behind groups of size 1
bool afterFirstGrouping = false;
bool seenGrouping = false;
bool seenDecimal = false;
int32_t digitsAfterDecimal = 0;
// Initial offset before any character consumption.
int32_t initialOffset = segment.getOffset();
int32_t exponent = 0;
bool hasPartialPrefix = false;
// Return value: whether to ask for more characters.
bool maybeMore = false;
// All digits consumed so far.
number::impl::DecimalQuantity digitsConsumed;
digitsConsumed.bogus = true;
// The total number of digits after the decimal place, used for scaling the result.
int32_t digitsAfterDecimalPlace = 0;
// The actual grouping and decimal separators used in the string.
// If not bogus, we have seen that token.
UnicodeString actualGroupingString;
UnicodeString actualDecimalString;
actualGroupingString.setToBogus();
actualDecimalString.setToBogus();
// Information for two groups: the previous group and the current group.
//
// Each group has three pieces of information:
//
// Offset: the string position of the beginning of the group, including a leading separator
// if there was a leading separator. This is needed in case we need to rewind the parse to
// that position.
//
// Separator type:
// 0 => beginning of string
// 1 => lead separator is a grouping separator
// 2 => lead separator is a decimal separator
//
// Count: the number of digits in the group. If -1, the group has been validated.
int32_t currGroupOffset = 0;
int32_t currGroupSepType = 0;
int32_t currGroupCount = 0;
int32_t prevGroupOffset = -1;
int32_t prevGroupSepType = -1;
int32_t prevGroupCount = -1;
while (segment.length() > 0) {
hasPartialPrefix = false;
maybeMore = false;
// Attempt to match a digit.
int8_t digit = -1;
// Try by code point digit value.
int cp = segment.getCodePoint();
UChar32 cp = segment.getCodePoint();
if (u_isdigit(cp)) {
segment.adjustOffset(U16_LENGTH(cp));
digit = static_cast<int8_t>(u_digit(cp, 10));
@ -138,188 +166,207 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
segment.adjustOffset(overlap);
digit = static_cast<int8_t>(i);
break;
} else if (overlap == segment.length()) {
hasPartialPrefix = true;
}
maybeMore = maybeMore || (overlap == segment.length());
}
}
if (digit >= 0) {
// Digit was found.
// Check for grouping size violation
if (backupOffset != -1) {
smallGroupBackupOffset = backupOffset;
backupOffset = -1;
if (requireGroupingMatch) {
// comma followed by digit, so group before comma is a secondary
// group. If there was a group separator before that, the group
// must == the secondary group length, else it can be <= the
// secondary group length.
if ((afterFirstGrouping && groupedDigitCount != grouping2) ||
(!afterFirstGrouping && groupedDigitCount > grouping2)) {
strictFail = true;
break;
}
} else {
// #11230: don't accept groups after the first with only 1 digit.
// The logic to back up and remove the lone digit is lower down.
if (afterFirstGrouping && groupedDigitCount == 1) {
break;
}
}
afterFirstGrouping = true;
groupedDigitCount = 0;
if (digitsConsumed.bogus) {
digitsConsumed.bogus = false;
digitsConsumed.clear();
}
// Save the digit in the DecimalQuantity or scientific adjustment.
if (exponentSign != 0) {
int32_t nextExponent;
// i.e., nextExponent = exponent * 10 + digit
UBool overflow = uprv_mul32_overflow(exponent, 10, &nextExponent) ||
uprv_add32_overflow(nextExponent, digit, &nextExponent);
if (overflow) {
exponent = INT32_MAX;
} else {
exponent = nextExponent;
}
} else {
if (result.quantity.bogus) {
result.quantity.bogus = false;
}
result.quantity.appendDigit(digit, 0, true);
}
result.setCharsConsumed(segment);
groupedDigitCount++;
if (seenDecimal) {
digitsAfterDecimal++;
digitsConsumed.appendDigit(digit, 0, true);
currGroupCount++;
if (!actualDecimalString.isBogus()) {
digitsAfterDecimalPlace++;
}
continue;
}
// Attempt to match a literal grouping or decimal separator
int32_t decimalOverlap = segment.getCommonPrefixLength(actualDecimalString);
bool decimalStringMatch = decimalOverlap == actualDecimalString.length();
int32_t groupingOverlap = segment.getCommonPrefixLength(actualGroupingString);
bool groupingStringMatch = groupingOverlap == actualGroupingString.length();
// Attempt to match a literal grouping or decimal separator.
bool isDecimal = false;
bool isGrouping = false;
hasPartialPrefix = (decimalOverlap == segment.length()) || (groupingOverlap == segment.length());
if (!seenDecimal && !groupingStringMatch &&
(decimalStringMatch || (!seenDecimal && decimalUniSet->contains(cp)))) {
// matched a decimal separator
if (requireGroupingMatch) {
if (backupOffset != -1 || (seenGrouping && groupedDigitCount != grouping1)) {
strictFail = true;
break;
}
// 1) Attempt the decimal separator string literal.
// if (we have not seen a decimal separator yet) { ... }
if (actualDecimalString.isBogus()) {
int overlap = segment.getCommonPrefixLength(decimalSeparator);
maybeMore = maybeMore || (overlap == segment.length());
if (overlap == decimalSeparator.length()) {
isDecimal = true;
actualDecimalString = decimalSeparator;
}
}
// If we're only parsing integers, then don't parse this one.
if (integerOnly) {
break;
// 2) Attempt to match the actual grouping string literal.
if (!actualGroupingString.isBogus()) {
int overlap = segment.getCommonPrefixLength(actualGroupingString);
maybeMore = maybeMore || (overlap == segment.length());
if (overlap == actualGroupingString.length()) {
isGrouping = true;
}
}
seenDecimal = true;
if (!decimalStringMatch) {
// 2.5) Attempt to match a new grouping separator string literal.
// if (we have not seen a grouping or decimal separator yet) { ... }
if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
int overlap = segment.getCommonPrefixLength(groupingSeparator);
maybeMore = maybeMore || (overlap == segment.length());
if (overlap == groupingSeparator.length()) {
isGrouping = true;
actualGroupingString = groupingSeparator;
}
}
// 3) Attempt to match a decimal separator from the equivalence set.
// if (we have not seen a decimal separator yet) { ... }
// The !isGrouping is to confirm that we haven't yet matched the current character.
if (!isGrouping && actualDecimalString.isBogus()) {
if (decimalUniSet->contains(cp)) {
isDecimal = true;
actualDecimalString = UnicodeString(cp);
}
segment.adjustOffset(actualDecimalString.length());
result.setCharsConsumed(segment);
result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
continue;
}
if (!groupingDisabled && !decimalStringMatch &&
(groupingStringMatch || (!seenGrouping && groupingUniSet->contains(cp)))) {
// matched a grouping separator
if (requireGroupingMatch) {
if (groupedDigitCount == 0) {
// leading group
strictFail = true;
break;
} else if (backupOffset != -1) {
// two group separators in a row
break;
}
}
if (fractionGroupingDisabled && seenDecimal) {
// Stop parsing here.
break;
}
seenGrouping = true;
if (!groupingStringMatch) {
// 4) Attempt to match a grouping separator from the equivalence set.
// if (we have not seen a grouping or decimal separator yet) { ... }
if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
if (groupingUniSet->contains(cp)) {
isGrouping = true;
actualGroupingString = UnicodeString(cp);
}
backupOffset = segment.getOffset();
segment.adjustOffset(actualGroupingString.length());
// Note: do NOT set charsConsumed
continue;
}
// Not a digit and not a separator
break;
}
// Back up if there was a trailing grouping separator
if (backupOffset != -1) {
segment.setOffset(backupOffset);
hasPartialPrefix = true; // redundant with `groupingOverlap == segment.length()`
}
// Check the final grouping for validity
if (requireGroupingMatch && !seenDecimal && seenGrouping && afterFirstGrouping &&
groupedDigitCount != grouping1) {
strictFail = true;
}
// #11230: don't accept groups after the first with only 1 digit.
// Behavior in this case is to back up before that 1-digit group.
if (!seenDecimal && afterFirstGrouping && groupedDigitCount == 1) {
if (segment.length() == 0) {
// Strings like "9,999" where we looked at only the first 3 chars.
// Ask for a longer segment.
hasPartialPrefix = true;
// Leave if we failed to match this as a separator.
if (!isDecimal && !isGrouping) {
break;
}
segment.setOffset(smallGroupBackupOffset);
result.setCharsConsumed(segment);
if (smallGroupBackupOffset == initialOffset) {
// Strings like ",9"
// Reset to no quantity seen.
result.quantity.clear();
result.quantity.bogus = true;
// Check for conditions when we don't want to accept the separator.
if (isDecimal && integerOnly) {
break;
} else if (currGroupSepType == 2 && isGrouping) {
// Fraction grouping
break;
}
// Validate intermediate grouping sizes.
bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
// Invalid grouping sizes.
if (isGrouping && currGroupCount == 0) {
// Trailing grouping separators: these are taken care of below
U_ASSERT(currGroupSepType == 1);
} else if (requireGroupingMatch) {
// Strict mode: reject the parse
digitsConsumed.clear();
digitsConsumed.bogus = true;
}
break;
} else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
break;
} else {
// Strings like "9,9"
// Remove the lone digit from the result quantity.
U_ASSERT(!result.quantity.bogus);
result.quantity.adjustMagnitude(-1);
result.quantity.truncate();
// Grouping sizes OK so far.
prevGroupOffset = currGroupOffset;
prevGroupCount = currGroupCount;
if (isDecimal) {
// Do not validate this group any more.
prevGroupSepType = -1;
} else {
prevGroupSepType = currGroupSepType;
}
}
// OK to accept the separator.
// Special case: don't update currGroup if it is empty; this allows two grouping
// separators in a row in lenient mode.
if (currGroupCount != 0) {
currGroupOffset = segment.getOffset();
}
currGroupSepType = isGrouping ? 1 : 2;
currGroupCount = 0;
if (isGrouping) {
segment.adjustOffset(actualGroupingString.length());
} else {
segment.adjustOffset(actualDecimalString.length());
}
}
if (requireGroupingMatch && strictFail) {
result = backupResult;
// End of main loop.
// Back up if there was a trailing grouping separator.
// Shift prev -> curr so we can check it as a final group.
if (currGroupSepType != 2 && currGroupCount == 0) {
maybeMore = true;
segment.setOffset(currGroupOffset);
currGroupOffset = prevGroupOffset;
currGroupSepType = prevGroupSepType;
currGroupCount = prevGroupCount;
prevGroupOffset = -1;
prevGroupSepType = 0;
prevGroupCount = 1;
}
// Validate final grouping sizes.
bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
if (!requireGroupingMatch) {
// The cases we need to handle here are lone digits.
// Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
// See more examples in numberformattestspecification.txt
int digitsToRemove = 0;
if (!prevValidSecondary) {
segment.setOffset(prevGroupOffset);
digitsToRemove += prevGroupCount;
digitsToRemove += currGroupCount;
} else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
maybeMore = true;
segment.setOffset(currGroupOffset);
digitsToRemove += currGroupCount;
}
if (digitsToRemove != 0) {
digitsConsumed.adjustMagnitude(-digitsToRemove);
digitsConsumed.truncate();
}
prevValidSecondary = true;
currValidPrimary = true;
}
if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
// Grouping failure.
digitsConsumed.bogus = true;
}
// Strings that start with a separator but have no digits,
// or strings that failed a grouping size check.
if (digitsConsumed.bogus) {
maybeMore = maybeMore || (segment.length() == 0);
segment.setOffset(initialOffset);
return maybeMore;
}
if (result.quantity.bogus && segment.getOffset() != initialOffset) {
// Strings that start with a separator but have no digits.
// We don't need a backup of ParsedNumber because no changes could have been made to it.
segment.setOffset(initialOffset);
hasPartialPrefix = true;
}
// We passed all inspections. Start post-processing.
if (!result.quantity.bogus) {
// The final separator was a decimal separator.
result.quantity.adjustMagnitude(-digitsAfterDecimal);
}
// Adjust for fraction part.
digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
// Set the digits, either normal or exponent.
if (exponentSign != 0 && segment.getOffset() != initialOffset) {
U_ASSERT(!result.quantity.bogus);
bool overflow = (exponent == INT32_MAX);
if (!overflow) {
overflow = result.quantity.adjustMagnitude(exponentSign * exponent);
bool overflow = false;
if (digitsConsumed.fitsInLong()) {
long exponentLong = digitsConsumed.toLong(false);
U_ASSERT(exponentLong >= 0);
if (exponentLong <= INT32_MAX) {
auto exponentInt = static_cast<int32_t>(exponentLong);
if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
overflow = true;
}
} else {
overflow = true;
}
} else {
overflow = true;
}
if (overflow) {
if (exponentSign == -1) {
@ -331,9 +378,51 @@ bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t
result.flags |= FLAG_INFINITY;
}
}
} else {
result.quantity = digitsConsumed;
}
return segment.length() == 0 || hasPartialPrefix;
// Set other information into the result and return.
if (!actualDecimalString.isBogus()) {
result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
}
result.setCharsConsumed(segment);
return segment.length() == 0 || maybeMore;
}
/**
 * Checks whether a digit group has an acceptable size.
 *
 * @param sepType   What preceded the group: -1 = no such group, 0 = beginning of string,
 *                  1 = grouping separator, 2 = decimal separator.
 * @param count     Number of digits in the group.
 * @param isPrimary true to validate against the primary grouping size (grouping1);
 *                  false to validate against the secondary grouping size (grouping2).
 * @return true if the group passes validation under the current strictness mode.
 */
bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
    if (!requireGroupingMatch) {
        // Lenient mode. #11230: the only rejected shape is a group that follows a
        // grouping separator and contains exactly one digit.
        return sepType != 1 || count != 1;
    }

    // Strict mode: the group size must agree with the configured grouping sizes.
    switch (sepType) {
        case -1:
            // No such group (prevGroup before first shift).
            return true;
        case 0:
            // First group. With no grouping separators seen, any size is OK as a primary
            // group; as a secondary group it must be non-empty and no longer than grouping2.
            return isPrimary || (count != 0 && count <= grouping2);
        case 1:
            // Middle group: must match the relevant grouping size exactly.
            return count == (isPrimary ? grouping1 : grouping2);
        default:
            // After the decimal separator: fraction digits are never size-checked.
            U_ASSERT(sepType == 2);
            return true;
    }
}
bool DecimalMatcher::smokeTest(const StringSegment& segment) const {

View file

@ -38,8 +38,9 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
/** If true, do not accept grouping separators at all */
bool groupingDisabled;
/** If true, do not accept fraction grouping separators */
bool fractionGroupingDisabled;
// Fraction grouping parsing is disabled for now but could be enabled later.
// See http://bugs.icu-project.org/trac/ticket/10794
// bool fractionGrouping;
/** If true, do not accept numbers in the fraction */
bool integerOnly;
@ -62,6 +63,8 @@ class DecimalMatcher : public NumberParseMatcher, public UMemory {
LocalPointer<const UnicodeSet> fLocalDecimalUniSet;
LocalPointer<const UnicodeSet> fLocalSeparatorSet;
LocalArray<const UnicodeString> fLocalDigitStrings;
bool validateGroup(int32_t sepType, int32_t count, bool isPrimary) const;
};

View file

@ -101,8 +101,6 @@ NumberParserImpl::createParserFromProperties(const number::impl::DecimalFormatPr
Grouper grouper = Grouper::forProperties(properties);
int parseFlags = 0;
if (affixProvider == nullptr || U_FAILURE(status)) { return nullptr; }
// Fraction grouping is disabled by default because it has never been supported in DecimalFormat
parseFlags |= PARSE_FLAG_FRACTION_GROUPING_DISABLED;
if (!properties.parseCaseSensitive) {
parseFlags |= PARSE_FLAG_IGNORE_CASE;
}

View file

@ -41,12 +41,13 @@ enum ParseFlags {
PARSE_FLAG_STRICT_GROUPING_SIZE = 0x0008,
PARSE_FLAG_INTEGER_ONLY = 0x0010,
PARSE_FLAG_GROUPING_DISABLED = 0x0020,
PARSE_FLAG_FRACTION_GROUPING_DISABLED = 0x0040,
// PARSE_FLAG_FRACTION_GROUPING_ENABLED = 0x0040, // see #10794
PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 0x0080,
PARSE_FLAG_USE_FULL_AFFIXES = 0x0100,
PARSE_FLAG_EXACT_AFFIX = 0x0200,
PARSE_FLAG_PLUS_SIGN_ALLOWED = 0x0400,
// PARSE_FLAG_OPTIMIZE = 0x0800, // no longer used
// PARSE_FLAG_FORCE_BIG_DECIMAL = 0x1000, // not used in ICU4C
};

View file

@ -48,6 +48,13 @@ void NumberParserTest::testBasic() {
{7, u"51,423", u"#,##,##0", 6, 51423.},
{7, u" 51,423", u"#,##,##0", 7, 51423.},
{7, u"51,423 ", u"#,##,##0", 6, 51423.},
{7, u"51,423,", u"#,##,##0", 6, 51423.},
{7, u"51,423,,", u"#,##,##0", 6, 51423.},
{7, u"51,423.5", u"#,##,##0", 8, 51423.5},
{7, u"51,423.5,", u"#,##,##0", 8, 51423.5},
{7, u"51,423.5,,", u"#,##,##0", 8, 51423.5},
{7, u"51,423.5.", u"#,##,##0", 8, 51423.5},
{7, u"51,423.5..", u"#,##,##0", 8, 51423.5},
{7, u"𝟱𝟭,𝟰𝟮𝟯", u"#,##,##0", 11, 51423.},
{7, u"𝟳,𝟴𝟵,𝟱𝟭,𝟰𝟮𝟯", u"#,##,##0", 19, 78951423.},
{7, u"𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", u"#,##,##0", 18, 78951.423},
@ -55,6 +62,18 @@ void NumberParserTest::testBasic() {
{7, u"𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", u"#,##,##0", 18, 78000.},
{7, u"𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", u"#,##,##0", 18, 78000.023},
{7, u"𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", u"#,##,##0", 11, 78.},
{7, u"1,", u"#,##,##0", 1, 1.},
{7, u"1,,", u"#,##,##0", 1, 1.},
{7, u"1.,", u"#,##,##0", 2, 1.},
{3, u"1,.", u"#,##,##0", 3, 1.},
{7, u"1..", u"#,##,##0", 2, 1.},
{3, u",1", u"#,##,##0", 2, 1.},
{3, u"1,1", u"#,##,##0", 1, 1.},
{3, u"1,1,", u"#,##,##0", 1, 1.},
{3, u"1,1,,", u"#,##,##0", 1, 1.},
{3, u"1,1,1", u"#,##,##0", 1, 1.},
{3, u"1,1,1,", u"#,##,##0", 1, 1.},
{3, u"1,1,1,,", u"#,##,##0", 1, 1.},
{3, u"-51423", u"0", 6, -51423.},
{3, u"51423-", u"0", 5, 51423.}, // plus and minus sign by default do NOT match after
{3, u"+51423", u"0", 6, 51423.},

View file

@ -760,8 +760,10 @@ parse output breaks
// JDK stops parsing at the spaces. JDK doesn't see space as a grouping separator
(34 25E-1) -342.5 K
(34,,25E-1) -342.5
// H doesn't allow trailing separators before E but C and P do
(34,,25,E-1) -342.5 CHJP
// Trailing grouping separators are not OK.
// H fails; C/J/P stop at the offending separator.
(34,,25,E-1) fail CJKP
(34,,25,E-1) -3425 HK
(34 25 E-1) -342.5 HK
(34,,25 E-1) -342.5 HK
// Spaces are not allowed after exponent symbol
@ -999,7 +1001,7 @@ parse output breaks
१३ 13
१३.३१‍ 13.31
123'456 123456
524'1.3 5241.3
524'11.3 52411.3
३'११‍ 311
test parse with European-style comma/period
@ -1442,8 +1444,8 @@ NaN NaN K
1E2147483646 1E+2147483646 HJK
1E-2147483649 0
1E-2147483648 0
// H, K, C and P return zero here
1E-2147483647 1E-2147483647 CHJKP
// H and K return zero here
1E-2147483647 1E-2147483647 HJK
1E-2147483646 1E-2147483646 HJK
test format push limits
@ -1476,26 +1478,43 @@ pattern lenient parse output breaks
#,##0 1 9 99 999 K
#,##0 1 9 999 9999 K
#,##0 1 9 9 9 9 H
#,##0 1 ,9 fail HK
#,##0 1 ,9 9
#,##0 1 99,.0 99
#,##0 1 9 9. 9 H
#,##0 1 9 99. 999 K
0 1 9 9 9
0 1 9 99 9
0 1 9 999 9
0 1 9 9 9 9
0 1 ,9 fail
0 1 99,.0 99
0 1 9 9. 9
0 1 9 99. 9
#,##0 0 9 9 fail K
#,##0 0 9 99 fail K
#,##0 0 9 999 9999 K
#,##0 0 9 9 9 fail K
#,##0 0 ,9 fail K
#,##0 0 99,.0 fail K
#,##0 0 9 9. fail K
#,##0 0 9 99. fail K
0 0 9 9 9
0 0 9 99 9
0 0 9 999 9
0 0 9 9 9 9
0 0 ,9 fail
0 0 99,.0 99
0 0 9 9. 9
0 0 9 99. 9
test more strict grouping parse
set locale en
set pattern #,##,##0
begin
lenient parse output breaks
1 1,23,, 123
0 9999, 9999
0 1,23,, fail K
test parse ignorables
set locale ar

View file

@ -23,8 +23,9 @@ public class DecimalMatcher implements NumberParseMatcher {
/** If true, do not accept grouping separators at all */
private final boolean groupingDisabled;
/** If true, do not accept fraction grouping separators */
private final boolean fractionGroupingDisabled;
// Fraction grouping parsing is disabled for now but could be enabled later.
// See http://bugs.icu-project.org/trac/ticket/10794
// private final boolean fractionGrouping;
/** If true, do not accept numbers in the fraction */
private final boolean integerOnly;
@ -93,11 +94,13 @@ public class DecimalMatcher implements NumberParseMatcher {
requireGroupingMatch = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_STRICT_GROUPING_SIZE);
groupingDisabled = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_GROUPING_DISABLED);
fractionGroupingDisabled = 0 != (parseFlags
& ParsingUtils.PARSE_FLAG_FRACTION_GROUPING_DISABLED);
integerOnly = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_INTEGER_ONLY);
grouping1 = grouper.getPrimary();
grouping2 = grouper.getSecondary();
// Fraction grouping parsing is disabled for now but could be enabled later.
// See http://bugs.icu-project.org/trac/ticket/10794
// fractionGrouping = 0 != (parseFlags & ParsingUtils.PARSE_FLAG_FRACTION_GROUPING_ENABLED);
}
@Override
@ -120,28 +123,46 @@ public class DecimalMatcher implements NumberParseMatcher {
assert result.quantity != null;
}
ParsedNumber backupResult = null;
if (requireGroupingMatch) {
backupResult = new ParsedNumber();
backupResult.copyFrom(result);
}
// strict parsing
boolean strictFail = false; // did we exit with a strict parse failure?
String actualGroupingString = groupingSeparator;
String actualDecimalString = decimalSeparator;
int groupedDigitCount = 0; // tracking count of digits delimited by grouping separator
int backupOffset = -1; // used for preserving the last confirmed position
int smallGroupBackupOffset = -1; // used to back up behind groups of size 1
boolean afterFirstGrouping = false;
boolean seenGrouping = false;
boolean seenDecimal = false;
int digitsAfterDecimal = 0;
// Initial offset before any character consumption.
int initialOffset = segment.getOffset();
int exponent = 0;
boolean hasPartialPrefix = false;
// Return value: whether to ask for more characters.
boolean maybeMore = false;
// All digits consumed so far.
DecimalQuantity_DualStorageBCD digitsConsumed = null;
// The total number of digits after the decimal place, used for scaling the result.
int digitsAfterDecimalPlace = 0;
// The actual grouping and decimal separators used in the string.
// If non-null, we have seen that token.
String actualGroupingString = null;
String actualDecimalString = null;
// Information for two groups: the previous group and the current group.
//
// Each group has three pieces of information:
//
// Offset: the string position of the beginning of the group, including a leading separator
// if there was a leading separator. This is needed in case we need to rewind the parse to
// that position.
//
// Separator type:
// 0 => beginning of string
// 1 => lead separator is a grouping separator
// 2 => lead separator is a decimal separator
//
// Count: the number of digits in the group. If -1, the group has been validated.
int currGroupOffset = 0;
int currGroupSepType = 0;
int currGroupCount = 0;
int prevGroupOffset = -1;
int prevGroupSepType = -1;
int prevGroupCount = -1;
while (segment.length() > 0) {
hasPartialPrefix = false;
maybeMore = false;
// Attempt to match a digit.
byte digit = -1;
@ -162,194 +183,207 @@ public class DecimalMatcher implements NumberParseMatcher {
segment.adjustOffset(overlap);
digit = (byte) i;
break;
} else if (overlap == segment.length()) {
hasPartialPrefix = true;
}
maybeMore = maybeMore || (overlap == segment.length());
}
}
if (digit >= 0) {
// Digit was found.
// Check for grouping size violation
if (backupOffset != -1) {
smallGroupBackupOffset = backupOffset;
backupOffset = -1;
if (requireGroupingMatch) {
// comma followed by digit, so group before comma is a secondary
// group. If there was a group separator before that, the group
// must == the secondary group length, else it can be <= the
// secondary group length.
if ((afterFirstGrouping && groupedDigitCount != grouping2)
|| (!afterFirstGrouping && groupedDigitCount > grouping2)) {
strictFail = true;
break;
}
} else {
// #11230: don't accept groups after the first with only 1 digit.
// The logic to back up and remove the lone digit is lower down.
if (afterFirstGrouping && groupedDigitCount == 1) {
break;
}
}
afterFirstGrouping = true;
groupedDigitCount = 0;
if (digitsConsumed == null) {
digitsConsumed = new DecimalQuantity_DualStorageBCD();
}
// Save the digit in the DecimalQuantity or scientific adjustment.
if (exponentSign != 0) {
int nextExponent = digit + exponent * 10;
if (nextExponent < exponent) {
// Overflow
exponent = Integer.MAX_VALUE;
} else {
exponent = nextExponent;
}
} else {
if (result.quantity == null) {
result.quantity = new DecimalQuantity_DualStorageBCD();
}
result.quantity.appendDigit(digit, 0, true);
}
result.setCharsConsumed(segment);
groupedDigitCount++;
if (seenDecimal) {
digitsAfterDecimal++;
digitsConsumed.appendDigit(digit, 0, true);
currGroupCount++;
if (actualDecimalString != null) {
digitsAfterDecimalPlace++;
}
continue;
}
// Attempt to match a literal grouping or decimal separator
int decimalOverlap = segment.getCommonPrefixLength(actualDecimalString);
boolean decimalStringMatch = decimalOverlap == actualDecimalString.length();
int groupingOverlap = segment.getCommonPrefixLength(actualGroupingString);
boolean groupingStringMatch = groupingOverlap == actualGroupingString.length();
// Attempt to match a literal grouping or decimal separator.
boolean isDecimal = false;
boolean isGrouping = false;
hasPartialPrefix = (decimalOverlap == segment.length())
|| (groupingOverlap == segment.length());
if (!seenDecimal
&& !groupingStringMatch
&& (decimalStringMatch || (!seenDecimal && decimalUniSet.contains(cp)))) {
// matched a decimal separator
if (requireGroupingMatch) {
if (backupOffset != -1 || (seenGrouping && groupedDigitCount != grouping1)) {
strictFail = true;
break;
}
// 1) Attempt the decimal separator string literal.
// if (we have not seen a decimal separator yet) { ... }
if (actualDecimalString == null) {
int overlap = segment.getCommonPrefixLength(decimalSeparator);
maybeMore = maybeMore || (overlap == segment.length());
if (overlap == decimalSeparator.length()) {
isDecimal = true;
actualDecimalString = decimalSeparator;
}
}
// If we're only parsing integers, then don't parse this one.
if (integerOnly) {
break;
// 2) Attempt to match the actual grouping string literal.
if (actualGroupingString != null) {
int overlap = segment.getCommonPrefixLength(actualGroupingString);
maybeMore = maybeMore || (overlap == segment.length());
if (overlap == actualGroupingString.length()) {
isGrouping = true;
}
}
seenDecimal = true;
if (!decimalStringMatch) {
// 2.5) Attempt to match a new grouping separator string literal.
// if (we have not seen a grouping or decimal separator yet) { ... }
if (!groupingDisabled && actualGroupingString == null && actualDecimalString == null) {
int overlap = segment.getCommonPrefixLength(groupingSeparator);
maybeMore = maybeMore || (overlap == segment.length());
if (overlap == groupingSeparator.length()) {
isGrouping = true;
actualGroupingString = groupingSeparator;
}
}
// 3) Attempt to match a decimal separator from the equivalence set.
// if (we have not seen a decimal separator yet) { ... }
// The !isGrouping is to confirm that we haven't yet matched the current character.
if (!isGrouping && actualDecimalString == null) {
if (decimalUniSet.contains(cp)) {
isDecimal = true;
actualDecimalString = UCharacter.toString(cp);
}
segment.adjustOffset(actualDecimalString.length());
result.setCharsConsumed(segment);
result.flags |= ParsedNumber.FLAG_HAS_DECIMAL_SEPARATOR;
continue;
}
if (!groupingDisabled
&& !decimalStringMatch
&& (groupingStringMatch || (!seenGrouping && groupingUniSet.contains(cp)))) {
// matched a grouping separator
if (requireGroupingMatch) {
if (groupedDigitCount == 0) {
// leading group
strictFail = true;
break;
} else if (backupOffset != -1) {
// two group separators in a row
break;
}
}
if (fractionGroupingDisabled && seenDecimal) {
// Stop parsing here.
break;
}
seenGrouping = true;
if (!groupingStringMatch) {
// 4) Attempt to match a grouping separator from the equivalence set.
// if (we have not seen a grouping or decimal separator yet) { ... }
if (!groupingDisabled && actualGroupingString == null && actualDecimalString == null) {
if (groupingUniSet.contains(cp)) {
isGrouping = true;
actualGroupingString = UCharacter.toString(cp);
}
backupOffset = segment.getOffset();
segment.adjustOffset(actualGroupingString.length());
// Note: do NOT set charsConsumed
continue;
}
// Not a digit and not a separator
break;
}
// Back up if there was a trailing grouping separator
if (backupOffset != -1) {
segment.setOffset(backupOffset);
hasPartialPrefix = true; // redundant with `groupingOverlap == segment.length()`
}
// Check the final grouping for validity
if (requireGroupingMatch
&& !seenDecimal
&& seenGrouping
&& afterFirstGrouping
&& groupedDigitCount != grouping1) {
strictFail = true;
}
// #11230: don't accept groups after the first with only 1 digit.
// Behavior in this case is to back up before that 1-digit group.
if (!seenDecimal && afterFirstGrouping && groupedDigitCount == 1) {
if (segment.length() == 0) {
// Strings like "9,999" where we looked at only the first 3 chars.
// Ask for a longer segment.
hasPartialPrefix = true;
// Leave if we failed to match this as a separator.
if (!isDecimal && !isGrouping) {
break;
}
segment.setOffset(smallGroupBackupOffset);
result.setCharsConsumed(segment);
if (smallGroupBackupOffset == initialOffset) {
// Strings like ",9"
// Reset to no quantity seen.
result.quantity = null;
// Check for conditions when we don't want to accept the separator.
if (isDecimal && integerOnly) {
break;
} else if (currGroupSepType == 2 && isGrouping) {
// Fraction grouping
break;
}
// Validate intermediate grouping sizes.
boolean prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
boolean currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
// Invalid grouping sizes.
if (isGrouping && currGroupCount == 0) {
// Trailing grouping separators: these are taken care of below
assert currGroupSepType == 1;
} else if (requireGroupingMatch) {
// Strict mode: reject the parse
digitsConsumed = null;
}
break;
} else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
break;
} else {
// Strings like "9,9"
// Remove the lone digit from the result quantity.
assert result.quantity != null;
result.quantity.adjustMagnitude(-1);
result.quantity.truncate();
// Grouping sizes OK so far.
prevGroupOffset = currGroupOffset;
prevGroupCount = currGroupCount;
if (isDecimal) {
// Do not validate this group any more.
prevGroupSepType = -1;
} else {
prevGroupSepType = currGroupSepType;
}
}
// OK to accept the separator.
// Special case: don't update currGroup if it is empty. This is to allow
// adjacent grouping separators in lenient mode: "1,,234"
if (currGroupCount != 0) {
currGroupOffset = segment.getOffset();
}
currGroupSepType = isGrouping ? 1 : 2;
currGroupCount = 0;
if (isGrouping) {
segment.adjustOffset(actualGroupingString.length());
} else {
segment.adjustOffset(actualDecimalString.length());
}
}
if (requireGroupingMatch && strictFail) {
result.copyFrom(backupResult);
// End of main loop.
// Back up if there was a trailing grouping separator.
// Shift prev -> curr so we can check it as a final group.
if (currGroupSepType != 2 && currGroupCount == 0) {
maybeMore = true;
segment.setOffset(currGroupOffset);
currGroupOffset = prevGroupOffset;
currGroupSepType = prevGroupSepType;
currGroupCount = prevGroupCount;
prevGroupOffset = -1;
prevGroupSepType = 0;
prevGroupCount = 1;
}
// Validate final grouping sizes.
boolean prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
boolean currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
if (!requireGroupingMatch) {
// The cases we need to handle here are lone digits.
// Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
// See more examples in numberformattestspecification.txt
int digitsToRemove = 0;
if (!prevValidSecondary) {
segment.setOffset(prevGroupOffset);
digitsToRemove += prevGroupCount;
digitsToRemove += currGroupCount;
} else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
maybeMore = true;
segment.setOffset(currGroupOffset);
digitsToRemove += currGroupCount;
}
if (digitsToRemove != 0) {
digitsConsumed.adjustMagnitude(-digitsToRemove);
digitsConsumed.truncate();
}
prevValidSecondary = true;
currValidPrimary = true;
}
if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
// Grouping failure.
digitsConsumed = null;
}
// Strings that start with a separator but have no digits,
// or strings that failed a grouping size check.
if (digitsConsumed == null) {
maybeMore = maybeMore || (segment.length() == 0);
segment.setOffset(initialOffset);
return maybeMore;
}
if (result.quantity == null && segment.getOffset() != initialOffset) {
// Strings that start with a separator but have no digits.
// We don't need a backup of ParsedNumber because no changes could have been made to it.
segment.setOffset(initialOffset);
hasPartialPrefix = true;
}
// We passed all inspections. Start post-processing.
if (result.quantity != null) {
// The final separator was a decimal separator.
result.quantity.adjustMagnitude(-digitsAfterDecimal);
}
// Adjust for fraction part.
digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
// Set the digits, either normal or exponent.
if (exponentSign != 0 && segment.getOffset() != initialOffset) {
boolean overflow = (exponent == Integer.MAX_VALUE);
if (!overflow) {
try {
result.quantity.adjustMagnitude(exponentSign * exponent);
} catch (ArithmeticException e) {
boolean overflow = false;
if (digitsConsumed.fitsInLong()) {
long exponentLong = digitsConsumed.toLong(false);
assert exponentLong >= 0;
if (exponentLong <= Integer.MAX_VALUE) {
int exponentInt = (int) exponentLong;
try {
result.quantity.adjustMagnitude(exponentSign * exponentInt);
} catch (ArithmeticException e) {
overflow = true;
}
} else {
overflow = true;
}
} else {
overflow = true;
}
if (overflow) {
if (exponentSign == -1) {
@ -361,9 +395,51 @@ public class DecimalMatcher implements NumberParseMatcher {
result.flags |= ParsedNumber.FLAG_INFINITY;
}
}
} else {
result.quantity = digitsConsumed;
}
return segment.length() == 0 || hasPartialPrefix;
// Set other information into the result and return.
if (actualDecimalString != null) {
result.flags |= ParsedNumber.FLAG_HAS_DECIMAL_SEPARATOR;
}
result.setCharsConsumed(segment);
return segment.length() == 0 || maybeMore;
}
/**
 * Decides whether a run of digits is an acceptable group, given the kind of separator that
 * preceded it.
 *
 * @param sepType
 *            Kind of separator in front of the group: -1 means there is no such group, 0 means
 *            this is the first group (nothing precedes it), 1 means a grouping separator, and
 *            2 means a decimal separator.
 * @param count
 *            Number of digits in the group.
 * @param isPrimary
 *            true to check the group against the primary grouping size; false to check it
 *            against the secondary grouping size.
 * @return true if the group is acceptable under the current strictness setting.
 */
private boolean validateGroup(int sepType, int count, boolean isPrimary) {
    if (!requireGroupingMatch) {
        // Lenient mode. #11230: the only thing rejected is a group that follows a
        // grouping separator and holds exactly one digit.
        return sepType != 1 || count != 1;
    }

    // Strict mode: group sizes must agree with the configured grouping sizes.
    switch (sepType) {
    case -1:
        // There is no group to check.
        return true;
    case 0:
        // First group: with no grouping separators anything goes as the primary group;
        // as a secondary group it must be non-empty and no longer than grouping2.
        return isPrimary || (count != 0 && count <= grouping2);
    case 1:
        // Group after a grouping separator: the size must match exactly.
        return count == (isPrimary ? grouping1 : grouping2);
    default:
        // Digits after the decimal separator are never size-checked.
        assert sepType == 2;
        return true;
    }
}
@Override

View file

@ -147,8 +147,6 @@ public class NumberParserImpl {
boolean isStrict = properties.getParseMode() == ParseMode.STRICT;
Grouper grouper = Grouper.forProperties(properties);
int parseFlags = 0;
// Fraction grouping is disabled by default because it has never been supported in DecimalFormat
parseFlags |= ParsingUtils.PARSE_FLAG_FRACTION_GROUPING_DISABLED;
if (!properties.getParseCaseSensitive()) {
parseFlags |= ParsingUtils.PARSE_FLAG_IGNORE_CASE;
}

View file

@ -16,7 +16,7 @@ public class ParsingUtils {
public static final int PARSE_FLAG_STRICT_GROUPING_SIZE = 0x0008;
public static final int PARSE_FLAG_INTEGER_ONLY = 0x0010;
public static final int PARSE_FLAG_GROUPING_DISABLED = 0x0020;
public static final int PARSE_FLAG_FRACTION_GROUPING_DISABLED = 0x0040;
// public static final int PARSE_FLAG_FRACTION_GROUPING_ENABLED = 0x0040; // see #10794
public static final int PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES = 0x0080;
public static final int PARSE_FLAG_USE_FULL_AFFIXES = 0x0100;
public static final int PARSE_FLAG_EXACT_AFFIX = 0x0200;

View file

@ -760,8 +760,10 @@ parse output breaks
// JDK stops parsing at the spaces. JDK doesn't see space as a grouping separator
(34 25E-1) -342.5 K
(34,,25E-1) -342.5
// H doesn't allow trailing separators before E but C and P do
(34,,25,E-1) -342.5 CHJP
// Trailing grouping separators are not OK.
// H fails; C/J/P stop at the offending separator.
(34,,25,E-1) fail CJKP
(34,,25,E-1) -3425 HK
(34 25 E-1) -342.5 HK
(34,,25 E-1) -342.5 HK
// Spaces are not allowed after exponent symbol
@ -999,7 +1001,7 @@ parse output breaks
१३ 13
१३.३१‍ 13.31
123'456 123456
524'1.3 5241.3
524'11.3 52411.3
३'११‍ 311
test parse with European-style comma/period
@ -1442,8 +1444,8 @@ NaN NaN K
1E2147483646 1E+2147483646 HJK
1E-2147483649 0
1E-2147483648 0
// H, K, C and P return zero here
1E-2147483647 1E-2147483647 CHJKP
// H and K return zero here
1E-2147483647 1E-2147483647 HJK
1E-2147483646 1E-2147483646 HJK
test format push limits
@ -1476,26 +1478,43 @@ pattern lenient parse output breaks
#,##0 1 9 99 999 K
#,##0 1 9 999 9999 K
#,##0 1 9 9 9 9 H
#,##0 1 ,9 fail HK
#,##0 1 ,9 9
#,##0 1 99,.0 99
#,##0 1 9 9. 9 H
#,##0 1 9 99. 999 K
0 1 9 9 9
0 1 9 99 9
0 1 9 999 9
0 1 9 9 9 9
0 1 ,9 fail
0 1 99,.0 99
0 1 9 9. 9
0 1 9 99. 9
#,##0 0 9 9 fail K
#,##0 0 9 99 fail K
#,##0 0 9 999 9999 K
#,##0 0 9 9 9 fail K
#,##0 0 ,9 fail K
#,##0 0 99,.0 fail K
#,##0 0 9 9. fail K
#,##0 0 9 99. fail K
0 0 9 9 9
0 0 9 99 9
0 0 9 999 9
0 0 9 9 9 9
0 0 ,9 fail
0 0 99,.0 99
0 0 9 9. 9
0 0 9 99. 9
test more strict grouping parse
set locale en
set pattern #,##,##0
begin
lenient parse output breaks
1 1,23,, 123
0 9999, 9999
0 1,23,, fail K
test parse ignorables
set locale ar

View file

@ -2875,6 +2875,8 @@ public class NumberFormatTest extends TestFmwk {
"1,2", // wrong number of digits after group separator
",.02", // leading group separator before decimal
"1,.02", // group separator before decimal
",0", // leading group separator before a single digit
",1", // leading group separator before a single digit
"1,45", // wrong number of digits in primary group
"1,45 that", // wrong number of digits in primary group
"1,45.34", // wrong number of digits in primary group
@ -2884,8 +2886,6 @@ public class NumberFormatTest extends TestFmwk {
};
// Fail both lenient and strict:
String[] failBoth = {
",0", // leading group separator before a single digit
",1", // leading group separator before a single digit
};
DecimalFormat nf = (DecimalFormat) NumberFormat.getInstance(Locale.ENGLISH);

View file

@ -54,6 +54,13 @@ public class NumberParserTest {
{ 7, "51,423", "#,##,##0", 6, 51423. },
{ 7, " 51,423", "#,##,##0", 7, 51423. },
{ 7, "51,423 ", "#,##,##0", 6, 51423. },
{ 7, "51,423,", "#,##,##0", 6, 51423. },
{ 7, "51,423,,", "#,##,##0", 6, 51423. },
{ 7, "51,423.5", "#,##,##0", 8, 51423.5 },
{ 7, "51,423.5,", "#,##,##0", 8, 51423.5 },
{ 7, "51,423.5,,", "#,##,##0", 8, 51423.5 },
{ 7, "51,423.5.", "#,##,##0", 8, 51423.5 },
{ 7, "51,423.5..", "#,##,##0", 8, 51423.5 },
{ 7, "𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 11, 51423. },
{ 7, "𝟳,𝟴𝟵,𝟱𝟭,𝟰𝟮𝟯", "#,##,##0", 19, 78951423. },
{ 7, "𝟳𝟴,𝟵𝟱𝟭.𝟰𝟮𝟯", "#,##,##0", 18, 78951.423 },
@ -61,6 +68,19 @@ public class NumberParserTest {
{ 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟬𝟬", "#,##,##0", 18, 78000. },
{ 7, "𝟳𝟴,𝟬𝟬𝟬.𝟬𝟮𝟯", "#,##,##0", 18, 78000.023 },
{ 7, "𝟳𝟴.𝟬𝟬𝟬.𝟬𝟮𝟯", "#,##,##0", 11, 78. },
{ 7, "1,", "#,##,##0", 1, 1. },
{ 7, "1,,", "#,##,##0", 1, 1. },
{ 7, "1.,", "#,##,##0", 2, 1. },
{ 3, "1,.", "#,##,##0", 3, 1. },
{ 7, "1..", "#,##,##0", 2, 1. },
{ 3, ",1", "#,##,##0", 2, 1. },
{ 3, "1,1", "#,##,##0", 1, 1. },
{ 3, "1,1,", "#,##,##0", 1, 1. },
{ 3, "1,1,,", "#,##,##0", 1, 1. },
{ 3, "1,1,1", "#,##,##0", 1, 1. },
{ 3, "1,1,1,", "#,##,##0", 1, 1. },
{ 3, "1,1,1,1", "#,##,##0", 1, 1. },
{ 3, "1,1,1,,", "#,##,##0", 1, 1. },
{ 3, "-51423", "0", 6, -51423. },
{ 3, "51423-", "0", 5, 51423. }, // plus and minus sign by default do NOT match after
{ 3, "+51423", "0", 6, 51423. },