ICU-2154 standardize whitespace handling by date/number format

X-SVN-Rev: 11456
2025-04-20 20:19:32 +00:00 · 2003-04-04 19:20:52 +00:00 · 2003-04-04 19:20:52 +00:00 · 52ac97f86e
commit 52ac97f86e
parent f6815a28c0
4 changed files with 234 additions and 107 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/format/DateFormatTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/format/DateFormatTest.java
@ -4,8 +4,8 @@
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/format/DateFormatTest.java,v $ 
- * $Date: 2003/03/13 20:27:47 $ 
- * $Revision: 1.13 $
+ * $Date: 2003/04/04 19:20:51 $ 
+ * $Revision: 1.14 $
 *
 *****************************************************************************************
 */
@ -876,6 +876,23 @@ public class DateFormatTest extends com.ibm.icu.dev.test.TestFmwk {
        expectParse(DATA, new Locale("en"));
    }

+    /**
+     * Test that a run of white space in a date pattern parses a run of white space in the input.
+     */
+    public void TestWhiteSpaceParsing() {
+        String DATA[] = {
+            "yyyy MM dd", // presumably the pattern the "expected parse" column is written in -- confirm against expectParse
+
+            // Each row: pattern, input, expected parse (or null if parse failure is expected)
+
+            // A run of spaces in the pattern should match a run of spaces in the input text
+            "MM   d yy",   " 04 01 03",    "2003 04 01",
+            null,          " 04  01   03 ", "2003 04 01", // null pattern: presumably reuses the previous row's pattern -- confirm
+        };
+
+        expectParse(DATA, new Locale("en"));
+    }
+
    public void TestCoverage() {
        Date now = new Date();
        Calendar cal = new GregorianCalendar();
--- a/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTest.java
@ -4,8 +4,8 @@
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/format/NumberFormatTest.java,v $ 
- * $Date: 2003/02/25 23:39:43 $ 
- * $Revision: 1.10 $
+ * $Date: 2003/04/04 19:20:52 $ 
+ * $Revision: 1.11 $
 *
 *****************************************************************************************
 */
@ -842,6 +842,14 @@ public class NumberFormatTest extends com.ibm.icu.dev.test.TestFmwk {
        }
    }

+    public void TestWhiteSpaceParsing() { // white space runs in affixes vs. runs of other lengths in the input
+        DecimalFormatSymbols US = new DecimalFormatSymbols(Locale.US);
+        DecimalFormat fmt = new DecimalFormat("a  b#0c  ", US); // two-space runs inside the prefix and at the end of the suffix
+        int n = 1234;
+        expect(fmt, "a b1234c ", n); // input uses single spaces where the pattern has two
+        expect(fmt, "a   b1234c   ", n); // input uses three spaces where the pattern has two
+    }
+    
    public void expectPad(DecimalFormat fmt, String pat, int pos) {
        expectPad(fmt, pat, pos, 0, (char)0);
    }
--- a/icu4j/src/com/ibm/icu/text/DecimalFormat.java
+++ b/icu4j/src/com/ibm/icu/text/DecimalFormat.java
@ -5,14 +5,16 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/DecimalFormat.java,v $ 
- * $Date: 2003/02/21 01:49:21 $ 
- * $Revision: 1.21 $
+ * $Date: 2003/04/04 19:20:52 $ 
+ * $Revision: 1.22 $
 *
 *****************************************************************************************
 */
 package com.ibm.icu.text;

 import com.ibm.icu.util.Currency;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.impl.UCharacterProperty;
 import java.text.ParsePosition;
 import java.text.FieldPosition;
 import java.math.BigInteger;
@ -1180,25 +1182,44 @@ public class DecimalFormat extends NumberFormat {
        int oldStart = parsePosition.getIndex();
        int backup;

-        // check for positivePrefix; take longest
-        boolean gotPositive = text.regionMatches(position,positivePrefix,0,
-                                                 positivePrefix.length());
-        boolean gotNegative = text.regionMatches(position,negativePrefix,0,
-                                                 negativePrefix.length());
-        if (gotPositive && gotNegative) {
-            if (positivePrefix.length() > negativePrefix.length())
-                gotNegative = false;
-            else if (positivePrefix.length() < negativePrefix.length())
-                gotPositive = false;
+        // Match positive and negative prefixes; prefer longest match.
+        int posMatch = compareAffix(positivePrefix, text, position);
+        int negMatch = compareAffix(negativePrefix, text, position);
+        if (posMatch >= 0 && negMatch >= 0) {
+            if (posMatch > negMatch) {
+                negMatch = -1;
+            } else if (negMatch > posMatch) {
+                posMatch = -1;
+            }  
        }
-        if (gotPositive) {
-            position += positivePrefix.length();
-        } else if (gotNegative) {
-            position += negativePrefix.length();
+        if (posMatch >= 0) {
+            position += posMatch;
+        } else if (negMatch >= 0) {
+            position += negMatch;
        } else {
            //PP:parsePosition.errorIndex = position;
            return false;
        }
+
+//        // check for positivePrefix; take longest
+//        boolean gotPositive = text.regionMatches(position,positivePrefix,0,
+//                                                 positivePrefix.length());
+//        boolean gotNegative = text.regionMatches(position,negativePrefix,0,
+//                                                 negativePrefix.length());
+//        if (gotPositive && gotNegative) {
+//            if (positivePrefix.length() > negativePrefix.length())
+//                gotNegative = false;
+//            else if (positivePrefix.length() < negativePrefix.length())
+//                gotPositive = false;
+//        }
+//        if (gotPositive) {
+//            position += positivePrefix.length();
+//        } else if (gotNegative) {
+//            position += negativePrefix.length();
+//        } else {
+//            //PP:parsePosition.errorIndex = position;
+//            return false;
+//        }
        // process digits or Inf, find decimal position
        status[STATUS_INFINITE] = false;
        if (!isExponent && text.regionMatches(position,symbols.getInfinity(),0,
@ -1371,32 +1392,57 @@ public class DecimalFormat extends NumberFormat {
            }
        }

-        // check for positiveSuffix
-        if (gotPositive)
-            gotPositive = text.regionMatches(position,positiveSuffix,0,
-                                             positiveSuffix.length());
-        if (gotNegative)
-            gotNegative = text.regionMatches(position,negativeSuffix,0,
-                                             negativeSuffix.length());
-
-        // if both match, take longest
-        if (gotPositive && gotNegative) {
-            if (positiveSuffix.length() > negativeSuffix.length())
-                gotNegative = false;
-            else if (positiveSuffix.length() < negativeSuffix.length())
-                gotPositive = false;
+        // Match positive and negative suffixes; prefer longest match.
+        if (posMatch >= 0) {
+            posMatch = compareAffix(positiveSuffix, text, position);
+        }
+        if (negMatch >= 0) {
+            negMatch = compareAffix(negativeSuffix, text, position);
+        }
+        if (posMatch >= 0 && negMatch >= 0) {
+            if (posMatch > negMatch) {
+                negMatch = -1;
+            } else if (negMatch > posMatch) {
+                posMatch = -1;
+            }  
        }

-        // fail if neither or both
-        if (gotPositive == gotNegative) {
+        // Fail if neither or both
+        if ((posMatch >= 0) == (negMatch >= 0)) {
            //PP:parsePosition.errorIndex = position;
            return false;
        }

-        parsePosition.setIndex(position +
-            (gotPositive ? positiveSuffix.length() : negativeSuffix.length())); // mark success!
+        parsePosition.setIndex(position + (posMatch>=0 ? posMatch : negMatch));

-        status[STATUS_POSITIVE] = gotPositive;
+        status[STATUS_POSITIVE] = (posMatch >= 0);
+
+//        // check for positiveSuffix
+//        if (gotPositive)
+//            gotPositive = text.regionMatches(position,positiveSuffix,0,
+//                                             positiveSuffix.length());
+//        if (gotNegative)
+//            gotNegative = text.regionMatches(position,negativeSuffix,0,
+//                                             negativeSuffix.length());
+//
+//        // if both match, take longest
+//        if (gotPositive && gotNegative) {
+//            if (positiveSuffix.length() > negativeSuffix.length())
+//                gotNegative = false;
+//            else if (positiveSuffix.length() < negativeSuffix.length())
+//                gotPositive = false;
+//        }
+//
+//        // fail if neither or both
+//        if (gotPositive == gotNegative) {
+//            //PP:parsePosition.errorIndex = position;
+//            return false;
+//        }
+//
+//        parsePosition.setIndex(position +
+//            (gotPositive ? positiveSuffix.length() : negativeSuffix.length())); // mark success!
+//
+//        status[STATUS_POSITIVE] = gotPositive;
        if (parsePosition.getIndex() == oldStart) {
            //PP:parsePosition.errorIndex = position;
            return false;
@ -1404,6 +1450,58 @@ public class DecimalFormat extends NumberFormat {
        return true;
    }

+    /**
+     * Return the length matched by the given affix, or -1 if none.
+     * A run of white space in the affix matches a run of one or more
+     * white space characters in the input.  Affix white space is tested with
+     * UCharacterProperty.isRuleWhiteSpace; input with UCharacter.isUWhiteSpace.
+     * @param affix pattern string, taken as a literal
+     * @param input input text
+     * @param pos offset into input at which to begin matching
+     * @return length of input that matches, or -1 if match failure
+     */
+    private int compareAffix(String affix, String input, int pos) {
+        int start = pos;
+        for (int i=0; i<affix.length(); ) {
+            int c = UTF16.charAt(affix, i);
+            int len = UTF16.getCharCount(c);
+            i += len;
+            if (UCharacterProperty.isRuleWhiteSpace(c)) {
+                // Advance over the rest of the white space run in the affix
+                while (i < affix.length()) {
+                    c = UTF16.charAt(affix, i);
+                    if (!UCharacterProperty.isRuleWhiteSpace(c)) {
+                        break;
+                    }
+                    i += UTF16.getCharCount(c);
+                }
+                
+                // Advance over the white space run in the input text
+                int s = pos;
+                while (pos < input.length()) {
+                    c = UTF16.charAt(input, pos);
+                    if (!UCharacter.isUWhiteSpace(c)) {
+                        break;
+                    }
+                    pos += UTF16.getCharCount(c);
+                }
+
+                // Must see at least one white space char in input
+                if (pos == s) {
+                    return -1;
+                }
+            } else {
+                if (pos < input.length() &&
+                    UTF16.charAt(input, pos) == c) {
+                    pos += len;
+                } else {
+                    return -1;
+                }
+            }
+        }
+        return pos - start;
+    }
+
    /**
     * Returns the decimal format symbols, which is generally not changed
     * by the programmer or user.
--- a/icu4j/src/com/ibm/icu/text/SimpleDateFormat.java
+++ b/icu4j/src/com/ibm/icu/text/SimpleDateFormat.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/SimpleDateFormat.java,v $ 
- * $Date: 2003/03/13 20:28:29 $ 
- * $Revision: 1.19 $
+ * $Date: 2003/04/04 19:20:52 $ 
+ * $Revision: 1.20 $
 *
 *****************************************************************************************
 */
@ -18,6 +18,7 @@ import com.ibm.icu.util.Calendar;
 import com.ibm.icu.util.SimpleTimeZone;
 import com.ibm.icu.util.TimeZone;
 import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.impl.UCharacterProperty;

 import java.io.IOException;
 import java.io.ObjectInputStream;
@ -669,10 +670,10 @@ public class SimpleDateFormat extends DateFormat {
     * @see DateFormat
     * @stable ICU 2.0
     */
-    public void parse(String text, Calendar cal, ParsePosition pos)
+    public void parse(String text, Calendar cal, ParsePosition parsePos)
    {
-        int start = pos.getIndex();
-        int oldStart = start;
+        int pos = parsePos.getIndex();
+        int start = pos;
        boolean[] ambiguousYear = {false};
        int count = 0;

@ -684,52 +685,13 @@ public class SimpleDateFormat extends DateFormat {
        int abutPat = -1; // If >=0, we are in a run of abutting numeric fields
        int abutStart = 0;
        int abutPass = 0;
+        boolean inQuote = false;

        for (int i=0; i<pattern.length(); ++i) {
            char ch = pattern.charAt(i);

-            // Handle quoted strings.  Two consecutive quotes is a
-            // quote literal, inside or outside of quotes.
-            if (ch == '\'') {
-                abutPat = -1; // End of any abutting fields
-
-                // Match a quote literal '' outside of quotes
-                if ((i+1)<pattern.length() && pattern.charAt(i+1)==ch) {
-                    if (start==text.length() || text.charAt(start) != ch) {
-                        pos.setIndex(oldStart);
-                        pos.setErrorIndex(start);
-                        return;
-                    }
-                    ++start;
-                    ++i; // Skip over doubled quote
-                    continue;
-                }
-
-                // Match a quoted string, including any embedded ''
-                // quote literals.  Note that we allow an unclosed
-                // quote for backward compatibility.
-                while (++i<pattern.length()) {
-                    ch = pattern.charAt(i);
-                    if (ch == '\'') {
-                        if ((i+1)<pattern.length() && pattern.charAt(i+1)==ch) {
-                            ++i;
-                            // Fall through and match literal quote
-                        } else {
-                            break; // Closing quote seen
-                        }
-                    }
-                    if (start==text.length() || text.charAt(start) != ch) {
-                        pos.setIndex(oldStart);
-                        pos.setErrorIndex(start);
-                        return;
-                    }
-                    ++start;
-                }
-                continue;
-            }
-
            // Handle alphabetic field characters.
-            if (ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z') {
+            if (!inQuote && (ch >= 'A' && ch <= 'Z' || ch >= 'a' && ch <= 'z')) {
                int fieldPat = i;

                // Count the length of this field specifier
@ -766,7 +728,7 @@ public class SimpleDateFormat extends DateFormat {
                            // fields.
                            if (abutting) {
                                abutPat = fieldPat;
-                                abutStart = start;
+                                abutStart = pos;
                                abutPass = 0;
                            }
                        }
@ -790,20 +752,20 @@ public class SimpleDateFormat extends DateFormat {
                    if (fieldPat == abutPat) {
                        count -= abutPass++;
                        if (count == 0) {
-                            pos.setIndex(oldStart);
-                            pos.setErrorIndex(start);
+                            parsePos.setIndex(start);
+                            parsePos.setErrorIndex(pos);
                            return;
                        }
                    }

-                    start = subParse(text, start, ch, count,
-                                     true, false, ambiguousYear, cal);
+                    pos = subParse(text, pos, ch, count,
+                                   true, false, ambiguousYear, cal);

                    // If the parse fails anywhere in the run, back up to the
                    // start of the run and retry.
-                    if (start < 0) {
+                    if (pos < 0) {
                        i = abutPat - 1;
-                        start = abutStart;
+                        pos = abutStart;
                        continue;
                    }
                }
@ -811,28 +773,70 @@ public class SimpleDateFormat extends DateFormat {
                // Handle non-numeric fields and non-abutting numeric
                // fields.
                else {
-                    int k = start;
-                    start=subParse(text, start, ch, count,
+                    int s = pos;
+                    pos = subParse(text, pos, ch, count,
                                   false, true, ambiguousYear, cal);

-                    if (start < 0) {
-                        pos.setErrorIndex(k);
-                        pos.setIndex(oldStart);
+                    if (pos < 0) {
+                        parsePos.setErrorIndex(s);
+                        parsePos.setIndex(start);
                        return;
                    }
                }
            }

-            // Handle unquoted non-alphabetic characters.  These are
-            // treated as literals.
+            // Handle literal pattern characters.  These are any
+            // quoted characters and non-alphabetic unquoted
+            // characters.
            else {
+                
                abutPat = -1; // End of any abutting fields
-                if (start==text.length() || text.charAt(start) != ch) {
-                    pos.setIndex(oldStart);
-                    pos.setErrorIndex(start);
-                    return;
+
+                // Handle quotes.  Two consecutive quotes is a quote
+                // literal, inside or outside of quotes.  Otherwise a
+                // quote indicates entry or exit from a quoted region.
+                if (ch == '\'') {
+                    // Match a quote literal '' within OR outside of quotes
+                    if ((i+1)<pattern.length() && pattern.charAt(i+1)==ch) {
+                        ++i; // Skip over doubled quote
+                        // Fall through and treat quote as a literal
+                    } else {
+                        // Enter or exit quoted region
+                        inQuote = !inQuote;
+                        continue;
+                    }
                }
-                ++start;
+
+                // A run of white space in the pattern matches a run
+                // of white space in the input text.
+                if (UCharacterProperty.isRuleWhiteSpace(ch)) {
+                    // Advance over run in pattern
+                    while ((i+1)<pattern.length() &&
+                           UCharacterProperty.isRuleWhiteSpace(pattern.charAt(i+1))) {
+                        ++i;
+                    }
+                    
+                    // Advance over run in input text
+                    int s = pos;
+                    while (pos<text.length() &&
+                           UCharacter.isUWhiteSpace(text.charAt(pos))) {
+                        ++pos;
+                    }
+
+                    // Must see at least one white space char in input
+                    if (pos > s) {
+                        continue;
+                    }
+                } else if (pos<text.length() && text.charAt(pos)==ch) {
+                    // Match a literal
+                    ++pos;
+                    continue;
+                }
+
+                // We fall through to this point if the match fails
+                parsePos.setIndex(start);
+                parsePos.setErrorIndex(pos);
+                return;
            }
        }

@ -840,7 +844,7 @@ public class SimpleDateFormat extends DateFormat {
        // will fill in default values for missing fields when the time
        // is computed.

-        pos.setIndex(start);
+        parsePos.setIndex(pos);

        // This part is a problem:  When we call parsedDate.after, we compute the time.
        // Take the date April 3 2004 at 2:30 am.  When this is first set up, the year
@ -883,8 +887,8 @@ public class SimpleDateFormat extends DateFormat {
        // An IllegalArgumentException will be thrown by Calendar.getTime()
        // if any fields are out of range, e.g., MONTH == 17.
        catch (IllegalArgumentException e) {
-            pos.setErrorIndex(start);
-            pos.setIndex(oldStart);
+            parsePos.setErrorIndex(pos);
+            parsePos.setIndex(start);
        }
    }