ICU-13060 Documentation and parsing whitespace/bidi tweaks.

X-SVN-Rev: 39951
2025-04-08 06:53:45 +00:00 · 2017-03-29 02:31:49 +00:00 · 2017-03-29 02:31:49 +00:00 · 08dcc7e82e
commit 08dcc7e82e
parent dd47bbc596
4 changed files with 127 additions and 23 deletions
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/number/Parse.java
@ -11,6 +11,7 @@ import java.util.Iterator;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;

+import com.ibm.icu.impl.PatternProps;
 import com.ibm.icu.impl.StandardPlural;
 import com.ibm.icu.impl.TextTrieMap;
 import com.ibm.icu.impl.number.formatters.BigDecimalMultiplier;
@ -1129,7 +1130,7 @@ public class Parse {
            }
            acceptGrouping(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
            if (state.length > 0 && mode == ParseMode.FAST) break;
-            acceptBidi(cp, StateName.AFTER_INTEGER_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
            if (state.length > 0 && mode == ParseMode.FAST) break;
            acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
            if (state.length > 0 && mode == ParseMode.FAST) break;
@ -1155,7 +1156,7 @@ public class Parse {
            // We encountered a decimal point
            acceptFractionDigit(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
            if (state.length > 0 && mode == ParseMode.FAST) break;
-            acceptBidi(cp, StateName.AFTER_FRACTION_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX, state, item);
            if (state.length > 0 && mode == ParseMode.FAST) break;
            acceptPadding(cp, StateName.BEFORE_SUFFIX, state, item);
            if (state.length > 0 && mode == ParseMode.FAST) break;
@ -1184,7 +1185,7 @@ public class Parse {
            break;

          case AFTER_EXPONENT_DIGIT:
-            acceptBidi(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
+            acceptBidi(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
            acceptPadding(cp, StateName.BEFORE_SUFFIX_SEEN_EXPONENT, state, item);
            acceptExponentDigit(cp, StateName.AFTER_EXPONENT_DIGIT, state, item);
            if (mode == ParseMode.LENIENT || mode == ParseMode.STRICT) {
@ -1257,17 +1258,17 @@ public class Parse {
            break;

          case INSIDE_STRING:
-            acceptStringOffset(cp, state, item);
+            long added0 = acceptStringOffset(cp, state, item);
            // Accept arbitrary bidi in the middle of strings.
-            if (state.length == 0 && UNISET_BIDI.contains(cp)) {
+            if (added0 == 0L && UNISET_BIDI.contains(cp)) {
              state.getNext().copyFrom(item, item.name, cp);
            }
            break;

          case INSIDE_AFFIX_PATTERN:
-            long added = acceptAffixPatternOffset(cp, state, item);
-            // Accept arbitrary bidi and whitespace (if lenient) in the middle of affixes.
-            if (added == 0L && isIgnorable(cp, state)) {
+            long added1 = acceptAffixPatternOffset(cp, state, item);
+            // Accept arbitrary bidi and whitespace (if lenient) in the middle of affix patterns.
+            if (added1 == 0L && isIgnorable(cp, state)) {
              state.getNext().copyFrom(item, item.name, cp);
            }
            break;
@ -1758,8 +1759,8 @@ public class Parse {
    if (!next.sawSuffix && holder.s.isEmpty()) next.score += 5;
  }

-  private static void acceptStringOffset(int cp, ParserState state, StateItem item) {
-    acceptString(
+  private static long acceptStringOffset(int cp, ParserState state, StateItem item) {
+    return acceptString(
        cp,
        item.returnTo1,
        item.returnTo2,
@ -1813,20 +1814,20 @@ public class Parse {
      referenceCp = Character.codePointAt(str, offset);
      count = Character.charCount(referenceCp);
      equals = codePointEquals(cp, referenceCp, state);
-      if (!UNISET_BIDI.contains(cp)) break;
+      if (!UNISET_BIDI.contains(referenceCp)) break;
    }

    if (equals) {
      // Matches first code point of the string
      StateItem next = state.getNext().copyFrom(item, null, cp);

-      // Skip over ignorable code points in the middle of the string.
+      // Skip over bidi code points in the middle or end of the string.
      // They will be accepted in the main loop.
      offset += count;
      for (; offset < str.length(); offset += count) {
        referenceCp = Character.codePointAt(str, offset);
        count = Character.charCount(referenceCp);
-        if (!UNISET_BIDI.contains(cp)) break;
+        if (!UNISET_BIDI.contains(referenceCp)) break;
      }

      if (offset < str.length()) {
@ -1879,7 +1880,7 @@ public class Parse {
      tag = AffixPatternUtils.nextToken(tag, str);
      typeOrCp = AffixPatternUtils.getTypeOrCp(tag);
      hasNext = AffixPatternUtils.hasNext(tag, str);
-      if (typeOrCp < 0 || !isIgnorable(typeOrCp, state)) break;
+      if (typeOrCp < 0 || !isPatternIgnorable(typeOrCp, state)) break;
    }

    // Convert from the returned tag to a code point, string, or currency to check
@ -1926,18 +1927,16 @@ public class Parse {
    while (hasNext) {
      long futureTag = AffixPatternUtils.nextToken(tag, str);
      int futureTypeOrCp = AffixPatternUtils.getTypeOrCp(futureTag);
-      if (futureTypeOrCp < 0 || !isIgnorable(futureTypeOrCp, state)) break;
+      if (futureTypeOrCp < 0 || !isPatternIgnorable(futureTypeOrCp, state)) break;
      tag = futureTag;
      typeOrCp = futureTypeOrCp;
      hasNext = AffixPatternUtils.hasNext(tag, str);
    }

    long added = 0L;
-    if (resolvedCp >= 0) {
+    if (resolvedCp >= 0 && codePointEquals(cp, resolvedCp, state)) {
      // Code point
-      if (!codePointEquals(cp, resolvedCp, state)) return 0L;
      StateItem next = state.getNext().copyFrom(item, null, cp);
-
      if (hasNext) {
        // Additional tokens in affix string.
        next.name = StateName.INSIDE_AFFIX_PATTERN;
@ -1948,6 +1947,7 @@ public class Parse {
        next.trailingCount = 0;
        next.returnTo1 = null;
      }
+      next.score += 1; // reward for consuming code point from pattern
      added |= 1L << state.lastInsertedIndex();
    }
    if (resolvedMinusSign) {
@ -2180,11 +2180,26 @@ public class Parse {
   *
   * @param cp The code point to test. Returns false if cp is negative.
   * @param state The current {@link ParserState}, used for determining strict mode.
-   * @return true if cp is bidi or whitespace in lenient mode; false otherwise.
+   * @return true if cp is ignorable; false otherwise.
   */
  private static boolean isIgnorable(int cp, ParserState state) {
    if (cp < 0) return false;
    if (UNISET_BIDI.contains(cp)) return true;
    return state.mode == ParseMode.LENIENT && UNISET_WHITESPACE.contains(cp);
  }
+
+  /**
+   * Checks whether the given code point is "ignorable" in pattern syntax. This includes all
+   * characters that are normally ignorable in {@link #isIgnorable} plus characters having the
+   * Pattern_White_Space property.
+   *
+   * @param cp The code point to test. Returns false if cp is negative.
+   * @param state The current {@link ParserState}, used for determining strict mode.
+   * @return true if cp is ignorable; false otherwise.
+   */
+  private static boolean isPatternIgnorable(int cp, ParserState state) {
+    if (cp < 0) return false;
+    if (state.mode == ParseMode.LENIENT && PatternProps.isWhiteSpace(cp)) return true;
+    return isIgnorable(cp, state);
+  }
 }
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DecimalFormat.java
@ -198,6 +198,15 @@ import com.ibm.icu.util.ULocale.Category;
 * and leaves the parse position unchanged. The convenience method {@link #parse(String)} indicates
 * parse failure by throwing a {@link java.text.ParseException}.
 *
+ * <p>Under the hood, a state table parsing engine is used. To debug a parsing failure during
+ * development, use the following pattern to print details about the state table transitions:
+ *
+ * <pre>
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = true;
+ * df.parse("123.45", ppos);
+ * com.ibm.icu.impl.number.Parse.DEBUGGING = false;
+ * </pre>
+ *
 * <h3>Thread Safety and Best Practices</h3>
 *
 * <p>Starting with ICU 59, instance of DecimalFormat are thread-safe.
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/numberformattestspecification.txt
@ -1439,3 +1439,62 @@ parse	output	breaks
 // K and J return null
 9 999	9999	JK

+test parse ignorables
+set locale ar
+// Note: Prefixes contain RLMs, as do some of the test cases.
+set pattern x a‎b0c df 
+set negativePrefix y g‎h
+set negativeSuffix i jk 
+begin
+parse	output	breaks
+x a‎b56c df 	56
+x  a‎b56c df 	56	K
+x ab56c df 	56	K
+x ab56c df	56	JK
+x ab56c df  	56	K
+x ab56	56	JK
+x a b56	56	JK
+56cdf	56	JK
+56c df	56	JK
+56cd f	56	JK
+56c‎d‎f	56	JK
+56cdf 	56	JK
+56c d‎f 	56	JK
+56‎c df 	56	JK
+y g‎h56i jk 	-56
+y  g‎h56i jk 	-56	KS
+y gh56i jk 	-56	K
+y gh56i jk	-56	JK
+y gh56i jk  	-56	K
+y gh56	-56	JK
+y g h56	-56	JKS
+// S stops parsing at the 'i' for these and returns -56
+56ijk	-56	JK
+56i jk	-56	JK
+56ij k	-56	JK
+56i‎j‎k	-56	JK
+56ijk 	-56	JK
+56i j‎k 	-56	JK
+56‎i jk 	-56	JK
+// S gets 56 (accepts ' ' gs grouping); J and K get null
+5 6	5	JKS
+5‎6	5	JK
+
+test parse spaces in grouping
+// This test gives the ideal behavior of these cases, which
+// none of the implementations currently support.
+set locale en
+set pattern #,##0
+begin
+parse	output	breaks
+// J and S get "12" here
+1 2	1	JS
+1 23	1	JS
+// K gets 1 here; doesn't pick up the grouping separator
+1 234	1234	K
+
+
+
+
+
+
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@ -4850,13 +4850,16 @@ public class NumberFormatTest extends TestFmwk {
    @Test
    public void Test11839() {
        DecimalFormatSymbols dfs = new DecimalFormatSymbols(ULocale.ENGLISH);
-        dfs.setMinusSign('∸');
-        dfs.setPlusSign('∔'); //  ∔  U+2214 DOT PLUS
+        dfs.setMinusSignString("a∸");
+        dfs.setPlusSignString("b∔"); //  ∔  U+2214 DOT PLUS
        DecimalFormat df = new DecimalFormat("0.00+;0.00-", dfs);
        String result = df.format(-1.234);
-        assertEquals("Locale-specific minus sign should be used", "1.23∸", result);
+        assertEquals("Locale-specific minus sign should be used", "1.23a∸", result);
        result = df.format(1.234);
-        assertEquals("Locale-specific plus sign should be used", "1.23∔", result);
+        assertEquals("Locale-specific plus sign should be used", "1.23b∔", result);
+        // Test round-trip with parse
+        expect2(df, -456, "456.00a∸");
+        expect2(df, 456, "456.00b∔");
    }

    @Test
@ -5171,6 +5174,24 @@ public class NumberFormatTest extends TestFmwk {
        }
    }

+    @Test
+    public void testParseIgnorables() {
+        // Also see the test case "test parse ignorables" in numberformattestspecification.txt
+        DecimalFormatSymbols dfs = DecimalFormatSymbols.getInstance();
+        dfs.setPercentString("\u200E%\u200E");
+        DecimalFormat df = new DecimalFormat("0%;-0a", dfs);
+        ParsePosition ppos = new ParsePosition(0);
+        Number result = df.parse("42\u200E%\u200E ", ppos);
+        assertEquals("Should parse as percentage", new BigDecimal("0.42"), result);
+        // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
+        assertEquals("Should not consume the trailing bidi even though it is in the symbol", 4, ppos.getIndex());
+        ppos.setIndex(0);
+        result = df.parse("-42a\u200E ", ppos);
+        assertEquals("Should parse as percent", new BigDecimal("-0.42"), result);
+        // NOTE: This behavior is specified only in 59. It is probably okay if it changes in the future.
+        assertEquals("Should not consume the trailing bidi or whitespace", 4, ppos.getIndex());
+    }
+
    @Test
    public void testSignificantDigitsMode() {
        String[][] allExpected = {