ICU-13060 Documentation and parsing whitespace/bidi tweaks.

X-SVN-Rev: 39951
This commit is contained in:
Shane Carr 2017-03-29 02:31:49 +00:00
parent dd47bbc596
commit 08dcc7e82e
4 changed files with 127 additions and 23 deletions

View file

@ -11,6 +11,7 @@ import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.impl.StandardPlural;
import com.ibm.icu.impl.TextTrieMap;
import com.ibm.icu.impl.number.formatters.BigDecimalMultiplier;
@ -1129,7 +1130,7 @@ public class Parse {
}
acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptBidi(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
@ -1155,7 +1156,7 @@ public class Parse {
// We encountered a decimal point
acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptBidi(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
if (state.length > 0 && mode == ParseMode.FAST) break;
@ -1184,7 +1185,7 @@ public class Parse {
break;
case AFTER_EXPONENT_DIGIT:
acceptBidi(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
@ -1257,17 +1258,17 @@ public class Parse {
break;
case INSIDE_STRING:
acceptStringOffset(cp, state, item);
long added0 = acceptStringOffset(cp, state, item);
// Accept arbitrary bidi in the middle of strings.
if (state.length == 0 && UNISET_BIDI.contains(cp)) {
if (added0 == 0L && UNISET_BIDI.contains(cp)) {
state.getNext().copyFrom(item, item.name, cp);
}
break;
case INSIDE_AFFIX_PATTERN:
long added = acceptAffixPatternOffset(cp, state, item);
// Accept arbitrary bidi and whitespace (if lenient) in the middle of affixes.
if (added == 0L && isIgnorable(cp, state)) {
long added1 = acceptAffixPatternOffset(cp, state, item);
// Accept arbitrary bidi and whitespace (if lenient) in the middle of affix patterns.
if (added1 == 0L && isIgnorable(cp, state)) {
state.getNext().copyFrom(item, item.name, cp);
}
break;
@ -1758,8 +1759,8 @@ public class Parse {
if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5;
}
private static void acceptStringOffset(int cp, ParserState state, StateItem item) {
acceptString(
private static long acceptStringOffset(int cp, ParserState state, StateItem item) {
return acceptString(
cp,
item.returnTo1,
item.returnTo2,
@ -1813,20 +1814,20 @@ public class Parse {
referenceCp = Character.codePointAt(str, offset);
count = Character.charCount(referenceCp);
equals = codePointEquals(cp, referenceCp, state);
if (!UNISET_BIDI.contains(cp)) break;
if (!UNISET_BIDI.contains(referenceCp)) break;
}
if (equals) {
// Matches first code point of the string
StateItem next = state.getNext().copyFrom(item, null, cp);
// Skip over ignorable code points in the middle of the string.
// Skip over bidi code points in the middle or end of the string.
// They will be accepted in the main loop.
offset += count;
for (; offset < str.length(); offset += count) {
referenceCp = Character.codePointAt(str, offset);
count = Character.charCount(referenceCp);
if (!UNISET_BIDI.contains(cp)) break;
if (!UNISET_BIDI.contains(referenceCp)) break;
}
if (offset < str.length()) {
@ -1879,7 +1880,7 @@ public class Parse {
tag = AffixPatternUtils.nextToken(tag, str);
typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
hasNext = AffixPatternUtils.hasNext(tag, str);
if (typeOrCp < 0 || !isIgnorable(typeOrCp, state)) break;
if (typeOrCp < 0 || !isPatternIgnorable(typeOrCp, state)) break;
}
// Convert from the returned tag to a code point, string, or currency to check
@ -1926,18 +1927,16 @@ public class Parse {
while (hasNext) {
long futureTag = AffixPatternUtils.nextToken(tag, str);
int futureTypeOrCp = AffixPatternUtils.getTypeOrCp(futureTag);
if (futureTypeOrCp < 0 || !isIgnorable(futureTypeOrCp, state)) break;
if (futureTypeOrCp < 0 || !isPatternIgnorable(futureTypeOrCp, state)) break;
tag = futureTag;
typeOrCp = futureTypeOrCp;
hasNext = AffixPatternUtils.hasNext(tag, str);
}
long added = 0L;
if (resolvedCp >= 0) {
if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) {
// Code point
if (!codePointEquals(cp, resolvedCp, state)) return 0L;
StateItem next = state.getNext().copyFrom(item, null, cp);
if (hasNext) {
// Additional tokens in affix string.
next.name = StateName.INSIDE_AFFIX_PATTERN;
@ -1948,6 +1947,7 @@ public class Parse {
next.trailingCount = 0;
next.returnTo1 = null;
}
next.score += 1; // reward for consuming code point from pattern
added |= 1L << state.lastInsertedIndex();
}
if (resolvedMinusSign) {
@ -2180,11 +2180,26 @@ public class Parse {
*
* @param cp The code point to test. Returns false if cp is negative.
* @param state The current {@link ParserState}, used for determining strict mode.
* @return true if cp is bidi or whitespace in lenient mode; false otherwise.
* @return true if cp is ignorable; false otherwise.
*/
private static boolean isIgnorable(int cp, ParserState state) {
    // Negative inputs are rejected up front.
    if (cp < 0) {
        return false;
    }
    // Bidi code points are ignorable in every mode; code points in UNISET_WHITESPACE
    // are ignorable only when the parser is running in lenient mode.
    return UNISET_BIDI.contains(cp)
            || (state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp));
}
/**
 * Tests whether a code point may be skipped when it appears in pattern syntax. A code point
 * qualifies if the parser is in lenient mode and {@link PatternProps#isWhiteSpace} accepts it,
 * or if {@link #isIgnorable} accepts it.
 *
 * @param cp The code point to test. Negative values always yield false.
 * @param state The current {@link ParserState}; its mode decides whether the lenient-only
 *     checks apply.
 * @return true if cp is ignorable in pattern syntax; false otherwise.
 */
private static boolean isPatternIgnorable(int cp, ParserState state) {
    if (cp < 0) {
        return false;
    }
    boolean lenient = state.mode == ParseMode.LENIENT;
    // Pattern white space is checked first; everything else defers to isIgnorable.
    return (lenient && PatternProps.isWhiteSpace(cp)) || isIgnorable(cp, state);
}
}

View file

@ -198,6 +198,15 @@ import com.ibm.icu.util.ULocale.Category;
* and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
* parse failure by throwing a {@link java.text.ParseException}.
*
* <p>Under the hood, a state table parsing engine is used. To debug a parsing failure during
* development, use the following pattern to print details about the state table transitions:
*
* <pre>
* com.ibm.icu.impl.number.Parse.DEBUGGING = true;
* df.parse("123.45", ppos);
* com.ibm.icu.impl.number.Parse.DEBUGGING = false;
* </pre>
*
* <h3>Thread Safety and Best Practices</h3>
*
* <p>Starting with ICU 59, instances of DecimalFormat are thread-safe.

View file

@ -1439,3 +1439,62 @@ parse output breaks
// K and J return null
9 999 9999 JK
test parse ignorables
set locale ar
// Note: Prefixes contain RLMs, as do some of the test cases.
set pattern x ab0c df
set negativePrefix y gh
set negativeSuffix i jk
begin
parse output breaks
x ab56c df 56
x ab56c df 56 K
x ab56c df 56 K
x ab56c df 56 JK
x ab56c df 56 K
x ab56 56 JK
x a b56 56 JK
56cdf 56 JK
56c df 56 JK
56cd f 56 JK
56cdf 56 JK
56cdf 56 JK
56c df 56 JK
56c df 56 JK
y gh56i jk -56
y gh56i jk -56 KS
y gh56i jk -56 K
y gh56i jk -56 JK
y gh56i jk -56 K
y gh56 -56 JK
y g h56 -56 JKS
// S stops parsing at the 'i' for these and returns -56
56ijk -56 JK
56i jk -56 JK
56ij k -56 JK
56ijk -56 JK
56ijk -56 JK
56i jk -56 JK
56i jk -56 JK
// S gets 56 (accepts ' ' as grouping); J and K get null
5 6 5 JKS
56 5 JK
test parse spaces in grouping
// This test gives the ideal behavior of these cases, which
// none of the implementations currently support.
set locale en
set pattern #,##0
begin
parse output breaks
// J and S get "12" here
1 2 1 JS
1 23 1 JS
// K gets 1 here; doesn't pick up the grouping separator
1 234 1234 K

View file

@ -4850,13 +4850,16 @@ public class NumberFormatTest extends TestFmwk {
@Test
public void Test11839() {
DecimalFormatSymbols dfs = new DecimalFormatSymbols(ULocale.ENGLISH);
dfs.setMinusSign('∸');
dfs.setPlusSign('∔'); // U+2214 DOT PLUS
dfs.setMinusSignString("a∸");
dfs.setPlusSignString("b∔"); // U+2214 DOT PLUS
DecimalFormat df = new DecimalFormat("0.00+;0.00-", dfs);
String result = df.format(-1.234);
assertEquals("Locale-specific minus sign should be used", "1.23", result);
assertEquals("Locale-specific minus sign should be used", "1.23a", result);
result = df.format(1.234);
assertEquals("Locale-specific plus sign should be used", "1.23∔", result);
assertEquals("Locale-specific plus sign should be used", "1.23b∔", result);
// Test round-trip with parse
expect2(df, -456, "456.00a∸");
expect2(df, 456, "456.00b∔");
}
@Test
@ -5171,6 +5174,24 @@ public class NumberFormatTest extends TestFmwk {
}
}
/**
 * Checks that bidi marks embedded in a symbol string are matched during parsing, and that
 * trailing bidi marks and whitespace after the suffix are left unconsumed.
 *
 * <p>The symbols are pinned to English so that the minus sign substituted into the negative
 * sub-pattern does not depend on the JVM's default locale.
 */
@Test
public void testParseIgnorables() {
    // Also see the test case "test parse ignorables" in numberformattestspecification.txt
    DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance(ULocale.ENGLISH);
    dfs.setPercentString("\u200E%\u200E");
    DecimalFormat df = new DecimalFormat("0%;-0a", dfs);

    // Positive sub-pattern: the percent symbol carries a bidi mark on both sides.
    ParsePosition ppos = new ParsePosition(0);
    Number result = df.parse("42\u200E%\u200E ", ppos);
    assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
    // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
    assertEquals("Should not consume the trailing bidi even though it is in the symbol", 4, ppos.getIndex());

    // Negative sub-pattern: literal suffix "a" followed by a bidi mark and a space.
    ppos.setIndex(0);
    result = df.parse("-42a\u200E ", ppos);
    assertEquals("Should parse as percent", new BigDecimal("-0.42"), result);
    // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
    assertEquals("Should not consume the trailing bidi or whitespace", 4, ppos.getIndex());
}
@Test
public void testSignificantDigitsMode() {
String[][] allExpected = {