ICU-1572 fix UnicodeSet.toPattern() round trip bugs

X-SVN-Rev: 7245
2025-04-21 12:40:02 +00:00 · 2001-12-01 01:33:41 +00:00 · 2001-12-01 01:33:41 +00:00 · 302bf822c7
commit 302bf822c7
parent 221d9f6880
5 changed files with 129 additions and 26 deletions
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -402,11 +402,13 @@ void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool useHexEscape)
    case BACKSLASH:
    case 123/*{*/:
    case 125/*}*/:
+    case SymbolTable::SYMBOL_REF:
+    case COLON:
        buf.append(BACKSLASH);
        break;
    default:
        // Escape whitespace
-        if (Unicode::isWhitespace(c)) {
+        if (u_isspace(c)) {
            buf.append(BACKSLASH);
        }
        break;
@ -435,8 +437,9 @@ UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
    if (pat.length() > 0) {
        int32_t i;
        int32_t backslashCount = 0;
-        for (i=0; i<pat.length(); ++i) {
-            UChar c = pat.charAt(i);
+        for (i=0; i<pat.length(); ) {
+            UChar32 c = pat.char32At(i);
+            i += UTF_CHAR_LENGTH(c);
            if (escapeUnprintable && Utility::isUnprintable(c)) {
                // If the unprintable character is preceded by an odd
                // number of backslashes, then it has been escaped.
@ -940,6 +943,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,

    const UChar32 NONE = (UChar32) -1;
    UChar32 lastChar = NONE; // This is either a char (0..10FFFF) or NONE
+    UBool isLastLiteral = FALSE; // TRUE if lastChar was a literal
    UChar lastOp = 0;

    /* This loop iterates over the characters in the pattern.  We start at
@ -1269,6 +1273,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
                _appendToPat(newPat, lastChar, FALSE);
            }
            lastChar = c;
+            isLastLiteral = isLiteral;
        }
    }

@ -1281,7 +1286,7 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
    // Treat a trailing '$' as indicating ETHER.  This code is only
    // executed if symbols == NULL; otherwise other code parses the
    // anchor.
-    if (lastChar == (UChar)SymbolTable::SYMBOL_REF) {
+    if (lastChar == (UChar)SymbolTable::SYMBOL_REF && !isLastLiteral) {
        rebuildPattern = TRUE;
        newPat.append(lastChar);
        add(TransliterationRule::ETHER);
--- a/icu4c/source/test/intltest/usettest.cpp
+++ b/icu4c/source/test/intltest/usettest.cpp
@ -44,10 +44,88 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
+        CASE(10,TestToPattern);
        default: name = ""; break;
    }
 }

+/**
+ * Test that toPattern() round trips for syntax characters and whitespace
+ * (via toPatternAux) and for the "[:nonspacing mark:]" property pattern.
+ */
+void UnicodeSetTest::TestToPattern() {
+    for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
+        if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
+            // Try the singleton, [0..i] and [i..FFFF]; after a failure skip the rest for this i.
+            if (i != 0 && !toPatternAux(i, i)) continue;
+            if (!toPatternAux(0, i)) continue;
+            if (!toPatternAux(i, 0xFFFF)) continue;
+        }
+    }
+    
+    UErrorCode ec = U_ZERO_ERROR;
+    UnicodeString spat = "[:nonspacing mark:]"; // property name containing a space
+    UnicodeSet s(spat, ec);
+    if (U_FAILURE(ec)) { errln("FAIL: UnicodeSet constructor"); return; }
+    UnicodeString tpat;
+    s.toPattern(tpat, TRUE); // TRUE: escape unprintables
+    UnicodeSet t(tpat, ec); // re-parse the generated pattern
+    if (U_FAILURE(ec)) {
+        errln((UnicodeString)"FAIL: " + spat + ".toPattern() => " + tpat +
+              ": INVALID PATTERN");
+    } else {
+        if (s!=t) {
+            UnicodeString str;
+            t.toPattern(str, TRUE);
+            errln((UnicodeString)"FAIL: " + spat + ".toPattern().new UnicodeSet() => " +
+                  str);
+        }
+    }
+}
+    
+// Round-trips the set [start..end] through toPattern(), with and without
+// escaped unprintables; returns FALSE as soon as one pattern fails to round trip.
+UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
+    // Build the label used in messages: "start" or "start..end" (decimal for now).
+    // TODO do these in hex
+    UnicodeString source; // declared first: initializing it from itself would read an unconstructed object
+    source = source + (int32_t)start;
+    if (start != end) source = source + ".." + (int32_t)end;
+    UnicodeSet testSet;
+    testSet.add(start, end);
+        
+    // What we want to make sure of is that a pattern generated
+    // by toPattern(), with or without escaped unprintables, can
+    // be passed back into the UnicodeSet constructor.
+    UnicodeString pat0; testSet.toPattern(pat0, TRUE);
+    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
+    
+    //String pat1 = unescapeLeniently(pat0);
+    //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
+    
+    UnicodeString pat2; testSet.toPattern(pat2, FALSE);
+    if (!checkPat(source, testSet, pat2)) return FALSE;
+    
+    //String pat3 = unescapeLeniently(pat2);
+    //if (!checkPat(source + " (in code)", testSet, pat3)) return false;
+    
+    //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
+    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
+    return TRUE;
+}
+    
+// Parses pat into a new set and compares it with testSet; source labels the
+// failure message. Returns TRUE on a successful round trip, FALSE otherwise.
+UBool UnicodeSetTest::checkPat(const UnicodeString& source, const UnicodeSet& testSet, const UnicodeString& pat) {
+    UErrorCode ec = U_ZERO_ERROR;
+    UnicodeSet testSet2(pat, ec);
+    if (U_FAILURE(ec) || testSet2 != testSet) { // a pattern that fails to parse is a failure too
+        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
+        return FALSE;
+    }
+    return TRUE;
+}
+
 void
 UnicodeSetTest::TestPatterns(void) {
    UnicodeSet set;
--- a/icu4c/source/test/intltest/usettest.h
+++ b/icu4c/source/test/intltest/usettest.h
@ -27,6 +27,12 @@ class UnicodeSetTest: public IntlTest {
    
 private:

+    /**
+     * Test that toPattern() round trips with syntax characters and
+     * whitespace.
+     */
+    void TestToPattern();
+    
    void TestPatterns(void);
    void TestCategories(void);
    void TestAddRemove(void);
@ -52,6 +58,10 @@ private:

 private:

+    UBool toPatternAux(UChar32 start, UChar32 end); // round-trip check for the set [start..end]
+    
+    UBool checkPat(const UnicodeString& source, const UnicodeSet& testSet, const UnicodeString& pat); // TRUE if pat parses to a set equal to testSet
+
    void _testComplement(int32_t a, UnicodeSet&, UnicodeSet&);

    void _testAdd(int32_t a, int32_t b, UnicodeSet&, UnicodeSet&, UnicodeSet&);
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
- * $Date: 2001/11/29 22:31:18 $
- * $Revision: 1.48 $
+ * $Date: 2001/12/01 01:31:18 $
+ * $Revision: 1.49 $
 *
 *****************************************************************************************
 */
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
 * Unicode property
 * </table>
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.48 $ $Date: 2001/11/29 22:31:18 $
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.49 $ $Date: 2001/12/01 01:31:18 $
 */
 public class UnicodeSet extends UnicodeFilter {

@ -443,6 +443,8 @@ public class UnicodeSet extends UnicodeFilter {
        case '\\': //BACKSLASH:
        case '{':
        case '}':
+        case '$':
+        case ':':
            buf.append('\\');
            break;
        default:
@ -475,8 +477,9 @@ public class UnicodeSet extends UnicodeFilter {
        if (pat != null) {
            int i;
            int backslashCount = 0;
-            for (i=0; i<pat.length(); ++i) {
-                char c = pat.charAt(i);
+            for (i=0; i<pat.length(); ) {
+                int c = UTF16.charAt(pat, i);
+                i += UTF16.getCharCount(c);
                if (escapeUnprintable && Utility.isUnprintable(c)) {
                    // If the unprintable character is preceded by an odd
                    // number of backslashes, then it has been escaped.
@ -488,7 +491,7 @@ public class UnicodeSet extends UnicodeFilter {
                    Utility.escapeUnprintable(result, c);
                    backslashCount = 0;
                } else {
-                    result.append(c);
+                    UTF16.append(result, c);
                    if (c == '\\') {
                        ++backslashCount;
                    } else {
@ -706,7 +709,7 @@ public class UnicodeSet extends UnicodeFilter {
                int start = list[i++];
                int count = list[i++] - start;
                if (index < count) {
-                    return (char)(start + index);
+                    return start + index;
                }
                index -= count;
            }
@ -1114,6 +1117,7 @@ public class UnicodeSet extends UnicodeFilter {

        final int NONE = -1;
        int lastChar = NONE; // This is either a char (0..10FFFF) or -1
+        boolean isLastLiteral = false; // TRUE if lastChar was a literal
        char lastOp = 0;

        /* This loop iterates over the characters in the pattern.  We start at
@ -1343,7 +1347,7 @@ public class UnicodeSet extends UnicodeFilter {
                    if (lastOp != 0) {
                        throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
                    }
-                    add((char) lastChar, (char) lastChar);
+                    add(lastChar, lastChar);
                    if (nestedPatDone) {
                        // If there was a character before the nested set,
                        // then we need to insert it in newPat before the
@ -1416,10 +1420,11 @@ public class UnicodeSet extends UnicodeFilter {
            } else {
                if (lastChar != NONE) {
                    // We have <char><char>
-                    add((char) lastChar, (char) lastChar);
+                    add(lastChar, lastChar);
                    _appendToPat(newPat, lastChar, false);
                }
                lastChar = c;
+                isLastLiteral = isLiteral;
            }
        }

@ -1430,9 +1435,9 @@ public class UnicodeSet extends UnicodeFilter {
        // Treat a trailing '$' as indicating ETHER.  This code is only
        // executed if symbols == NULL; otherwise other code parses the
        // anchor.
-        if (lastChar == SymbolTable.SYMBOL_REF) {
+        if (lastChar == SymbolTable.SYMBOL_REF && !isLastLiteral) {
            rebuildPattern = true;
-            newPat.append(lastChar);
+            newPat.append((char) lastChar);
            add(TransliterationRule.ETHER);
        }
        
--- a/icu4j/src/com/ibm/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/text/UnicodeSet.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeSet.java,v $
- * $Date: 2001/11/29 22:31:18 $
- * $Revision: 1.48 $
+ * $Date: 2001/12/01 01:31:18 $
+ * $Revision: 1.49 $
 *
 *****************************************************************************************
 */
@ -204,7 +204,7 @@ import com.ibm.util.Utility;
 * Unicode property
 * </table>
 * @author Alan Liu
- * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.48 $ $Date: 2001/11/29 22:31:18 $
+ * @version $RCSfile: UnicodeSet.java,v $ $Revision: 1.49 $ $Date: 2001/12/01 01:31:18 $
 */
 public class UnicodeSet extends UnicodeFilter {

@ -443,6 +443,8 @@ public class UnicodeSet extends UnicodeFilter {
        case '\\': //BACKSLASH:
        case '{':
        case '}':
+        case '$':
+        case ':':
            buf.append('\\');
            break;
        default:
@ -475,8 +477,9 @@ public class UnicodeSet extends UnicodeFilter {
        if (pat != null) {
            int i;
            int backslashCount = 0;
-            for (i=0; i<pat.length(); ++i) {
-                char c = pat.charAt(i);
+            for (i=0; i<pat.length(); ) {
+                int c = UTF16.charAt(pat, i);
+                i += UTF16.getCharCount(c);
                if (escapeUnprintable && Utility.isUnprintable(c)) {
                    // If the unprintable character is preceded by an odd
                    // number of backslashes, then it has been escaped.
@ -488,7 +491,7 @@ public class UnicodeSet extends UnicodeFilter {
                    Utility.escapeUnprintable(result, c);
                    backslashCount = 0;
                } else {
-                    result.append(c);
+                    UTF16.append(result, c);
                    if (c == '\\') {
                        ++backslashCount;
                    } else {
@ -706,7 +709,7 @@ public class UnicodeSet extends UnicodeFilter {
                int start = list[i++];
                int count = list[i++] - start;
                if (index < count) {
-                    return (char)(start + index);
+                    return start + index;
                }
                index -= count;
            }
@ -1114,6 +1117,7 @@ public class UnicodeSet extends UnicodeFilter {

        final int NONE = -1;
        int lastChar = NONE; // This is either a char (0..10FFFF) or -1
+        boolean isLastLiteral = false; // TRUE if lastChar was a literal
        char lastOp = 0;

        /* This loop iterates over the characters in the pattern.  We start at
@ -1343,7 +1347,7 @@ public class UnicodeSet extends UnicodeFilter {
                    if (lastOp != 0) {
                        throw new IllegalArgumentException("Illegal rhs for " + lastChar + lastOp);
                    }
-                    add((char) lastChar, (char) lastChar);
+                    add(lastChar, lastChar);
                    if (nestedPatDone) {
                        // If there was a character before the nested set,
                        // then we need to insert it in newPat before the
@ -1416,10 +1420,11 @@ public class UnicodeSet extends UnicodeFilter {
            } else {
                if (lastChar != NONE) {
                    // We have <char><char>
-                    add((char) lastChar, (char) lastChar);
+                    add(lastChar, lastChar);
                    _appendToPat(newPat, lastChar, false);
                }
                lastChar = c;
+                isLastLiteral = isLiteral;
            }
        }

@ -1430,9 +1435,9 @@ public class UnicodeSet extends UnicodeFilter {
        // Treat a trailing '$' as indicating ETHER.  This code is only
        // executed if symbols == NULL; otherwise other code parses the
        // anchor.
-        if (lastChar == SymbolTable.SYMBOL_REF) {
+        if (lastChar == SymbolTable.SYMBOL_REF && !isLastLiteral) {
            rebuildPattern = true;
-            newPat.append(lastChar);
+            newPat.append((char) lastChar);
            add(TransliterationRule.ETHER);
        }