ICU-2874 fix parsing of [^-b]

X-SVN-Rev: 11873
2025-04-07 22:44:49 +00:00 · 2003-05-09 21:26:52 +00:00 · 2003-05-09 21:26:52 +00:00 · 9a9e49c403
commit 9a9e49c403
parent 0f04c4bf54
3 changed files with 37 additions and 7 deletions
--- a/icu4c/source/common/uniset.cpp
+++ b/icu4c/source/common/uniset.cpp
@@ -1879,7 +1879,8 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,

    // mode 0: No chars parsed yet; next must be '['
    // mode 1: '[' seen; if next is '^' or ':' then special
-    // mode 2: '[' '^'? seen; parse pattern and close with ']'
+    // mode 15: "[^" seen; if next is '-' then literal
+    // mode 2: '[' '^'? '-'? seen; parse pattern and close with ']'
    // mode 3: '[:' seen; parse category and close with ':]'
    // mode 4: ']' seen; parse complete
    // mode 5: Top-level property pattern seen
@@ -1958,14 +1959,16 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
            case COMPLEMENT:
                invert = TRUE;
                newPat.append(c);
+                mode = 15;
                continue; // Back to top to fetch next character
            case HYPHEN:
                isLiteral = TRUE; // Treat leading '-' as a literal
                break; // Fall through
            }
            break;
-        case 2:
-            if (c == HYPHEN && invert) {
+        case 15:
+            mode = 2;
+            if (c == HYPHEN) {
                isLiteral = TRUE; // [^-...] starts with literal '-'
            }
            break;
--- a/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/UnicodeSetTest.java,v $ 
- * $Date: 2003/02/27 18:35:54 $ 
- * $Revision: 1.47 $
+ * $Date: 2003/05/09 21:25:25 $ 
+ * $Revision: 1.48 $
 *
 *****************************************************************************************
 */
@@ -836,6 +836,24 @@ public class UnicodeSetTest extends TestFmwk {
            "[:Case Sensitive:]",
            "A\u1FFC\\U00010410",
            ";\u00B4\\U00010500",
+
+
+            // Regex compatibility test
+            "[-b]", // leading '-' is literal
+            "-b",
+            "ac",
+            
+            "[^-b]", // leading '-' is literal
+            "ac",
+            "-b",
+            
+            "[b-]", // trailing '-' is literal
+            "-b",
+            "ac",
+            
+            "[^b-]", // trailing '-' is literal
+            "ac",
+            "-b",
        };

        for (int i=0; i<DATA.length; i+=3) {  
--- a/icu4j/src/com/ibm/icu/text/UnicodeSet.java
+++ b/icu4j/src/com/ibm/icu/text/UnicodeSet.java
@@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeSet.java,v $
- * $Date: 2003/04/09 23:01:03 $
- * $Revision: 1.94 $
+ * $Date: 2003/05/09 21:25:25 $
+ * $Revision: 1.95 $
 *
 *****************************************************************************************
 */
@@ -2007,6 +2007,7 @@ public class UnicodeSet extends UnicodeFilter {

        // mode 0: No chars parsed yet; next must be '['
        // mode 1: '[' seen; if next is '^' or ':' then special
+        // mode 15: "[^" seen; if next is '-' then literal
        // mode 2: '[' '^'? seen; parse pattern and close with ']'
        // mode 3: '[:' seen; parse category and close with ':]'
        // mode 4: ']' seen; parse complete
@@ -2082,11 +2083,19 @@ public class UnicodeSet extends UnicodeFilter {
                case '^':
                    invert = true;
                    newPat.append((char) c);
+                    mode = 15;
                    continue; // Back to top to fetch next character
                case '-':
                    isLiteral = true; // Treat leading '-' as a literal
                    break; // Fall through
                }
+                break;
+            case 15:
+                mode = 2;
+                if (c == '-') {
+                    isLiteral = true; // [^-...] starts with literal '-'
+                }
+                break;
                // else fall through and parse this character normally
            }