ICU-1232 disallow UnicodeSets (and other standins) in translit output

X-SVN-Rev: 6702
2025-04-10 07:39:16 +00:00 · 2001-11-09 01:22:33 +00:00 · 2001-11-09 01:22:33 +00:00 · 51f23530aa
commit 51f23530aa
parent 9a9e9fc74e
6 changed files with 67 additions and 8 deletions
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -93,7 +93,7 @@ public:

    virtual const UnicodeString* lookup(const UnicodeString& s) const;

-    virtual const UnicodeSet* lookupSet(UChar32 ch) const;
+    virtual const UnicodeMatcher* lookupMatcher(UChar32 ch) const;

    virtual UnicodeString parseReference(const UnicodeString& text,
                                         ParsePosition& pos, int32_t limit) const;
@ -113,15 +113,15 @@ const UnicodeString* ParseData::lookup(const UnicodeString& name) const {
 /**
 * Implement SymbolTable API.
 */
-const UnicodeSet* ParseData::lookupSet(UChar32 ch) const {
+const UnicodeMatcher* ParseData::lookupMatcher(UChar32 ch) const {
    // Note that we cannot use data.lookupSet() because the
    // set array has not been constructed yet.
-    const UnicodeSet* set = NULL;
+    const UnicodeMatcher* set = NULL; // stays NULL if ch is outside variablesVector's range
    int32_t i = ch - data->variablesBase;
    if (i >= 0 && i < variablesVector->size()) {
        int32_t i = ch - data->variablesBase;
        set = (i < variablesVector->size()) ?
-            (UnicodeSet*) variablesVector->elementAt(i) : 0;
+            (UnicodeMatcher*) variablesVector->elementAt(i) : 0;
    }
    return set;
 }
@ -1137,7 +1137,8 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
        // - allow arbitrary cursor offsets and do runtime checking.
        //(right->cursorOffset > (left->text.length() - left->post)) ||
        //(-right->cursorOffset > left->ante) ||
-        right->anchorStart || right->anchorEnd) {
+        right->anchorStart || right->anchorEnd ||
+        !isValidOutput(right->text)) {

        return syntaxError(U_MALFORMED_RULE, rule, start);
    }
@ -1161,6 +1162,21 @@ int32_t TransliteratorParser::parseRule(const UnicodeString& rule, int32_t pos,
    return pos;
 }

+/**
+ * Return TRUE if output is valid rule output: no code point in it is a
+ * stand-in for a matcher (set, quantifier, or other input-only element).
+ */
+UBool TransliteratorParser::isValidOutput(const UnicodeString& output) const {
+    for (int32_t i=0; i<output.length(); ) { // no ++i: i advances by whole code points below
+        UChar32 c = output.char32At(i);
+        i += UTF_CHAR_LENGTH(c);
+        if (parseData->lookupMatcher(c) != NULL) {
+            return FALSE;
+        }
+    }
+    return TRUE;
+}
+
 /**
 * Called by main parser upon syntax error.  Search the rule string
 * for the probable end of the rule.  Of course, if the error is that
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -210,6 +210,12 @@ private:
     */
    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit);

+    /**
+     * Return TRUE if the given string is valid rule output, that is, none
+     * of its code points is a stand-in for a matcher (an input-only element).
+     */
+    UBool isValidOutput(const UnicodeString& output) const;
+
    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
--- a/icu4c/source/i18n/symtable.h
+++ b/icu4c/source/i18n/symtable.h
@ -44,10 +44,10 @@ public:
    virtual const UnicodeString* lookup(const UnicodeString& s) const = 0;

    /**
-     * Lookup the UnicodeSet associated with the given character, and
+     * Look up the UnicodeMatcher associated with the given character, and
     * return it.  Return <tt>null</tt> if not found.
     */
-    virtual const UnicodeSet* lookupSet(UChar32 ch) const = 0;
+    virtual const UnicodeMatcher* lookupMatcher(UChar32 ch) const = 0;

    /**
     * Parse a symbol reference name from the given string, starting
--- a/icu4c/source/i18n/uniset.cpp
+++ b/icu4c/source/i18n/uniset.cpp
@ -106,6 +106,13 @@ const UChar32 UnicodeSet::MIN_VALUE = UNICODESET_LOW;
 */
 const UChar32 UnicodeSet::MAX_VALUE = UNICODESET_HIGH - 1;

+// NOTE: UnicodeMatcher::fgClassID is defined here, rather than in its
+// own file, until there is sufficient reason to add an entire separate
+// unimatch.cpp source file just for one line.
+const char UnicodeMatcher::fgClassID = 0;
+
+const char UnicodeSet::fgClassID = 0;
+
 //----------------------------------------------------------------
 // Constructors &c
 //----------------------------------------------------------------
@ -978,7 +985,12 @@ void UnicodeSet::_applyPattern(const UnicodeString& pattern,
            if (ivarValueBuffer < varValueBuffer->length()) {
                c = varValueBuffer->char32At(ivarValueBuffer);
                ivarValueBuffer += UTF_CHAR_LENGTH(c);
-                nestedSet = symbols->lookupSet(c); // may be NULL
+                const UnicodeMatcher *m = symbols->lookupMatcher(c); // may be NULL
+                if (m != NULL && m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
+                    status = U_ILLEGAL_ARGUMENT_ERROR;
+                    return;
+                }
+                nestedSet = (UnicodeSet*) m;
                nestedPatDone = FALSE;
            } else {
                varValueBuffer = NULL;
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -135,6 +135,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
        TESTCASE(53,TestTitleAccents);
        TESTCASE(54,TestLocaleResource);
        TESTCASE(55,TestParseError);
+        TESTCASE(56,TestOutputSet);
        default: name = ""; break;
    }
 }
@ -2612,6 +2613,7 @@ void TransliteratorTest::TestParseError() {
    UErrorCode ec = U_ZERO_ERROR;
    UParseError pe;
    Transliterator *t = Transliterator::createFromRules("ID", rule, UTRANS_FORWARD, pe, ec);
+    delete t;
    if (U_FAILURE(ec)) {
        UnicodeString err(pe.preContext);
        err.append((UChar)124/*|*/).append(pe.postContext);
@ -2625,6 +2627,24 @@ void TransliteratorTest::TestParseError() {
    errln("FAIL: no syntax error");
 }

+/**
+ * Verify that a rule whose output side references a set variable is
+ * rejected with a syntax error.
+ */
+void TransliteratorTest::TestOutputSet() {
+    UErrorCode status = U_ZERO_ERROR;
+    UParseError parseError;
+    UnicodeString rules = "$set = [a-cm-n]; b > $set;";
+    delete Transliterator::createFromRules("ID", rules, UTRANS_FORWARD, parseError, status);
+    if (!U_FAILURE(status)) {
+        errln("FAIL: No syntax error");
+        return;
+    }
+    UnicodeString context(parseError.preContext);
+    context.append((UChar)124/*|*/).append(parseError.postContext);
+    logln("Ok: " + context);
+}
+
 //======================================================================
 // icu4c ONLY
 // These tests are not mirrored (yet) in icu4j at
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@ -261,6 +261,11 @@ class TransliteratorTest : public IntlTest {
     */
    void TestParseError();

+    /**
+     * Make sure that a rule with a set variable on its output side is rejected.
+     */
+    void TestOutputSet();
+
    //======================================================================
    // Support methods
    //======================================================================