ICU-329 parse engines need better error reporting

X-SVN-Rev: 958
2025-04-13 08:53:20 +00:00 · 2000-03-18 01:42:45 +00:00 · 2000-03-18 01:42:45 +00:00 · af7124308c
commit af7124308c
parent 24bb0f4fce
7 changed files with 145 additions and 69 deletions
--- a/icu4c/source/i18n/i18n.dsp
+++ b/icu4c/source/i18n/i18n.dsp
@ -868,6 +868,25 @@ InputPath=.\unicode\numfmt.h
 # End Source File
 # Begin Source File

+SOURCE=.\unicode\parseerr.h
+
+!IF  "$(CFG)" == "i18n - Win32 Release"
+
+!ELSEIF  "$(CFG)" == "i18n - Win32 Debug"
+
+# Begin Custom Build
+InputPath=.\unicode\parseerr.h
+
+"..\..\include\unicode\parseerr.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
+	copy   unicode\parseerr.h    ..\..\include\unicode
+
+# End Custom Build
+
+!ENDIF 
+
+# End Source File
+# Begin Source File
+
 SOURCE=.\unicode\parsepos.h

 !IF  "$(CFG)" == "i18n - Win32 Release"
--- a/icu4c/source/i18n/rbt.cpp
+++ b/icu4c/source/i18n/rbt.cpp
@ -15,11 +15,12 @@

 void RuleBasedTransliterator::_construct(const UnicodeString& rules,
                                         Direction direction,
-                                         UErrorCode& status) {
+                                         UErrorCode& status,
+                                         ParseError* parseError) {
    data = 0;
    isDataOwned = TRUE;
    if (U_SUCCESS(status)) {
-        data = TransliterationRuleParser::parse(rules, direction);
+        data = TransliterationRuleParser::parse(rules, direction, parseError);
        if (data == 0) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
        } else {
--- a/icu4c/source/i18n/rbt_pars.cpp
+++ b/icu4c/source/i18n/rbt_pars.cpp
@ -16,6 +16,7 @@
 #include "cstring.h"
 #include "unicode/parsepos.h"
 #include "symtable.h"
+#include "unicode/parseerr.h"

 // Operators
 const UChar TransliterationRuleParser::VARIABLE_DEF_OP = 0x003D/*=*/;
@ -91,8 +92,9 @@ void ParseData::lookup(const UnicodeString& name, UChar& c, UnicodeSet*& set,

 TransliterationRuleData*
 TransliterationRuleParser::parse(const UnicodeString& rules,
-                                 RuleBasedTransliterator::Direction direction) {
-    TransliterationRuleParser parser(rules, direction);
+                                 RuleBasedTransliterator::Direction direction,
+                                 ParseError* parseError) {
+    TransliterationRuleParser parser(rules, direction, parseError);
    parser.parseRules();
    if (U_FAILURE(parser.status)) {
        delete parser.data;
@ -108,8 +110,9 @@ TransliterationRuleParser::parse(const UnicodeString& rules,
 */
 TransliterationRuleParser::TransliterationRuleParser(
                                     const UnicodeString& theRules,
-                                     RuleBasedTransliterator::Direction theDirection) :
-    rules(theRules), direction(theDirection), data(0) {
+                                     RuleBasedTransliterator::Direction theDirection,
+                                     ParseError* theParseError) :
+    rules(theRules), direction(theDirection), data(0), parseError(theParseError) { // theParseError may be 0; it is null-checked before every write
    parseData = new ParseData(0, &setVariablesVector);
 }

@ -139,6 +142,9 @@ void TransliterationRuleParser::parseRules(void) {

    parseData->data = data;
    setVariablesVector.removeAllElements();
+    if (parseError != 0) {
+        parseError->code = 0;
+    }
    determineVariableRange();

    int32_t pos = 0;
@ -225,19 +231,19 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
        // Handle escapes
        if (c == ESCAPE) {
            if (pos == limit) {
-                return syntaxError("Trailing backslash", rules, start);
+                return syntaxError(RuleBasedTransliterator::TRAILING_BACKSLASH, rules, start);
            }
            // Parse \uXXXX escapes
            c = rules.charAt(pos++);
            if (c == 0x0075/*u*/) {
                if ((pos+4) > limit) {
-                    return syntaxError("Malformed Unicode escape", rules, start);
+                    return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
                }
                c = (UChar)0x0000;
                for (int32_t plim=pos+4; pos<plim; ++pos) { // [sic]
                    int32_t digit = Unicode::digit(rules.charAt(pos), 16);
                    if (digit<0) {
-                        return syntaxError("Malformed Unicode escape", rules, start);
+                        return syntaxError(RuleBasedTransliterator::MALFORMED_UNICODE_ESCAPE, rules, start);
                    }
                    c = (UChar) ((c << 4) | digit);
                }
@ -261,7 +267,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
                 */
                for (;;) {
                    if (iq < 0) {
-                        return syntaxError("Unterminated quote", rules, start);
+                        return syntaxError(RuleBasedTransliterator::UNTERMINATED_QUOTE, rules, start);
                    }
                    scratch.truncate(0);
                    rules.extractBetween(pos, iq, scratch);
@ -280,7 +286,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
        }
        if (OPERATORS.indexOf(c) >= 0) {
            if (op != 0) {
-                return syntaxError("Unquoted special", rules, start);
+                return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
            }
            // Found an operator char.  Check for forward-reverse operator.
            if (c == REVERSE_RULE_OP &&
@ -308,21 +314,21 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
            {
                int32_t j = rules.indexOf(VARIABLE_REF_CLOSE, pos);
                if (pos == j || j < 0) { // empty or unterminated
-                    return syntaxError("Malformed variable reference", rules, start);
+                    return syntaxError(RuleBasedTransliterator::MALFORMED_VARIABLE_REFERENCE, rules, start);
                }
                scratch.truncate(0);
                rules.extractBetween(pos, j, scratch);
                pos = j+1;
                UChar v = data->lookupVariable(scratch, status);
                if (U_FAILURE(status)) {
-                    return syntaxError("Undefined variable", rules, start);
+                    return syntaxError(RuleBasedTransliterator::UNDEFINED_VARIABLE, rules, start);
                }
                buf.append(v);
            }
            break;
        case CONTEXT_OPEN:
            if (post >= 0) {
-                return syntaxError("Multiple post contexts", rules, start);
+                return syntaxError(RuleBasedTransliterator::MULTIPLE_POST_CONTEXTS, rules, start);
            }
            // Ignore CONTEXT_OPEN if buffer length is zero -- that means
            // this is the optional opening delimiter for the ante context.
@ -332,14 +338,14 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
            break;
        case CONTEXT_CLOSE:
            if (postClose >= 0) {
-                return syntaxError("Unexpected ')'", rules, start);
+                return syntaxError(RuleBasedTransliterator::UNEXPECTED_CLOSE_CONTEXT, rules, start);
            }
            if (post >= 0) {
                // This is probably the optional closing delimiter
                // for the post context; save the pos and check later.
                postClose = buf.length();
            } else if (ante >= 0) {
-                return syntaxError("Multiple ante contexts", rules, start);
+                return syntaxError(RuleBasedTransliterator::MULTIPLE_ANTE_CONTEXTS, rules, start);
            } else {
                ante = buf.length();
            }
@ -348,16 +354,16 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
            ParsePosition pp(pos-1); // Backup to opening '['
            buf.append(registerSet(new UnicodeSet(rules, pp, *parseData, status)));
            if (U_FAILURE(status)) {
-                return syntaxError("Invalid set", rules, start);
+                return syntaxError(RuleBasedTransliterator::MALFORMED_SET, rules, start);
            }
            pos = pp.getIndex(); }
            break;
        case VARIABLE_REF_CLOSE:
        case SET_CLOSE:
-            return syntaxError("Unquoted special", rules, start);
+            return syntaxError(RuleBasedTransliterator::UNQUOTED_SPECIAL, rules, start);
        case CURSOR_POS:
            if (cursor >= 0) {
-                return syntaxError("Multiple cursors", rules, start);
+                return syntaxError(RuleBasedTransliterator::MULTIPLE_CURSORS, rules, start);
            }
            cursor = buf.length();
            break;
@ -367,13 +373,13 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
        }
    }
    if (op == 0) {
-        return syntaxError("No operator", rules, start);
+        return syntaxError(RuleBasedTransliterator::MISSING_OPERATOR, rules, start);
    }

    // Check context close parameters
    if ((leftPostClose >= 0 && leftPostClose != left.length()) ||
        (postClose >= 0 && postClose != buf.length())) {
-        return syntaxError("Extra text after ]", rules, start);
+        return syntaxError(RuleBasedTransliterator::TEXT_AFTER_CLOSE_CONTEXT, rules, start);
    }

    // Context is only allowed on the input side; that is, the left side
@ -388,10 +394,10 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
        // character, it is either a multi-character string, or multiple
        // sets, or a mixture of chars and sets -- syntax error.
        if (buf.length() != 1) {
-            return syntaxError("Malformed RHS", rules, start);
+            return syntaxError(RuleBasedTransliterator::MALFORMED_RHS, rules, start);
        }
        if (data->isVariableDefined(left)) {
-            return syntaxError("Duplicate definition", rules, start);
+            return syntaxError(RuleBasedTransliterator::DUPLICATE_VARIABLE_DEFINITION, rules, start);
        }
        data->defineVariable(left, buf.charAt(0), status);
        break;
@ -399,7 +405,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
    case FORWARD_RULE_OP:
        if (direction == RuleBasedTransliterator::FORWARD) {
            if (ante >= 0 || post >= 0 || leftCursor >= 0) {
-                return syntaxError("Malformed rule", rules, start);
+                return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
            }
            data->ruleSet.addRule(new TransliterationRule(
                                     left, leftAnte, leftPost,
@ -410,7 +416,7 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
    case REVERSE_RULE_OP:
        if (direction == RuleBasedTransliterator::REVERSE) {
            if (leftAnte >= 0 || leftPost >= 0 || cursor >= 0) {
-                return syntaxError("Malformed rule", rules, start);
+                return syntaxError(RuleBasedTransliterator::MALFORMED_RULE, rules, start);
            }
            data->ruleSet.addRule(new TransliterationRule(
                                     buf, ante, post,
@ -457,15 +463,19 @@ int32_t TransliterationRuleParser::parseRule(int32_t pos, int32_t limit) {
 * @param rule pattern string
 * @param start position of first character of current rule
 */
-int32_t TransliterationRuleParser::syntaxError(const char* /*msg*/,
-                                               const UnicodeString& /*rule*/,
+int32_t TransliterationRuleParser::syntaxError(int32_t parseErrorCode,
+                                               const UnicodeString& rule,
                                               int32_t start) {
-//|    int end = quotedIndexOf(rule, start, rule.length(), ";");
-//|    if (end < 0) {
-//|        end = rule.length();
-//|    }
-//|    throw new IllegalArgumentException(msg + " in " +
-//|                                       rule.substring(start, end));
+    if (parseError != 0) {
+        parseError->code = parseErrorCode;
+        parseError->line = 0; // Line numbers are not tracked; always reported as 0
+        parseError->offset = start; // Position of the first character of the current rule
+        int32_t end = quotedIndexOf(rule, start, rule.length(), END_OF_RULE);
+        if (end < 0) {
+            end = rule.length();
+        }
+        rule.extractBetween(start, end, parseError->context); // Text of the current rule, up to its END_OF_RULE or the end of the text
+    }
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return start;
 }
@ -512,30 +522,21 @@ void TransliterationRuleParser::determineVariableRange(void) {
 }

 /**
- * Returns the index of the first character in a set, ignoring quoted text.
+ * Returns the index of a character, ignoring quoted text and escaped characters.
 * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
- * found by a search for "h".  Unlike String.indexOf(), this method searches
- * not for a single character, but for any character of the string
- * <code>setOfChars</code>.
- * @param text text to be searched
- * @param start the beginning index, inclusive; <code>0 <= start
- * <= limit</code>.
- * @param limit the ending index, exclusive; <code>start <= limit
- * <= text.length()</code>.
- * @param setOfChars string with one or more distinct characters
- * @return Offset of the first character in <code>setOfChars</code>
- * found, or -1 if not found.
- * @see #indexOf
+ * found by a search for 'h'.
 */
 int32_t TransliterationRuleParser::quotedIndexOf(const UnicodeString& text,
                                                 int32_t start, int32_t limit,
-                                                 const UnicodeString& setOfChars) {
+                                                 UChar charToFind) {
    for (int32_t i=start; i<limit; ++i) {
        UChar c = text.charAt(i);
-        if (c == QUOTE) {
+        if (c == ESCAPE) {
+            ++i;
+        } else if (c == QUOTE) {
            while (++i < limit
                   && text.charAt(i) != QUOTE) {}
-        } else if (setOfChars.indexOf(c) >= 0) {
+        } else if (c == charToFind) {
            return i;
        }
    }
--- a/icu4c/source/i18n/rbt_pars.h
+++ b/icu4c/source/i18n/rbt_pars.h
@ -10,6 +10,7 @@

 #include "unicode/rbt.h"
 #include "uvector.h"
+#include "unicode/parseerr.h"

 class TransliterationRuleData;
 class UnicodeSet;
@ -33,6 +34,12 @@ class TransliterationRuleParser {
     */
    UErrorCode status;

+    /**
+     * Pointer to user structure in which to return parse error information.
+     * May be NULL.
+     */
+    ParseError* parseError;
+
    /**
     * Temporary symbol table used during parsing.
     */
@ -84,7 +91,8 @@ public:

    static TransliterationRuleData*
        parse(const UnicodeString& rules,
-              RuleBasedTransliterator::Direction direction);
+              RuleBasedTransliterator::Direction direction,
+              ParseError* parseError = 0);
    
 private:

@ -94,7 +102,8 @@ private:
     * rules
     */
    TransliterationRuleParser(const UnicodeString& rules,
-                              RuleBasedTransliterator::Direction direction);
+                              RuleBasedTransliterator::Direction direction,
+                              ParseError* parseError = 0);

    /**
     * Destructor.
@ -135,7 +144,7 @@ private:
     * @param rule pattern string
     * @param start position of first character of current rule
     */
-    int32_t syntaxError(const char* msg, const UnicodeString&, int32_t start);
+    int32_t syntaxError(int32_t parseErrorCode, const UnicodeString&, int32_t start);

    /**
     * Allocate a private-use substitution character for the given set,
@ -155,24 +164,20 @@ private:
    void determineVariableRange(void);

    /**
-     * Returns the index of the first character in a set, ignoring quoted text.
+     * Returns the index of a character, ignoring quoted text and escaped characters.
     * For example, in the string "abc'hide'h", the 'h' in "hide" will not be
-     * found by a search for "h".  Unlike String.indexOf(), this method searches
-     * not for a single character, but for any character of the string
-     * <code>setOfChars</code>.
+     * found by a search for 'h'.
     * @param text text to be searched
     * @param start the beginning index, inclusive; <code>0 <= start
     * <= limit</code>.
     * @param limit the ending index, exclusive; <code>start <= limit
     * <= text.length()</code>.
-     * @param setOfChars string with one or more distinct characters
-     * @return Offset of the first character in <code>setOfChars</code>
-     * found, or -1 if not found.
-     * @see #indexOf
+     * @param c character to search for
+     * @return Offset of the first instance of c, or -1 if not found.
     */
    static int32_t quotedIndexOf(const UnicodeString& text,
                                 int32_t start, int32_t limit,
-                                 const UnicodeString& setOfChars);
+                                 UChar c);
 };

 #endif
--- a/icu4c/source/i18n/translit.cpp
+++ b/icu4c/source/i18n/translit.cpp
@ -534,7 +534,8 @@ Transliterator* Transliterator::createInverse(void) const {
 * @see #getID
 */
 Transliterator* Transliterator::createInstance(const UnicodeString& ID,
-                                               Transliterator::Direction dir) {
+                                               Transliterator::Direction dir,
+                                               ParseError* parseError) {
    if (ID.indexOf(ID_DELIM) >= 0) {
        return new CompoundTransliterator(ID, dir, 0);
    }
@ -546,10 +547,10 @@ Transliterator* Transliterator::createInstance(const UnicodeString& ID,
            ID.extractBetween(i+1, ID.length(), inverseID);
            ID.extractBetween(0, i, right);
            inverseID.append(ID_SEP).append(right);
-            t = _createInstance(inverseID);
+            t = _createInstance(inverseID, parseError);
        }
    } else {
-        t = _createInstance(ID);
+        t = _createInstance(ID, parseError);
    }
    return t;
 }
@ -607,7 +608,8 @@ inline int32_t Transliterator::hash(const UnicodeString& str) {
 * Returns a transliterator object given its ID.  Unlike getInstance(),
 * this method returns null if it cannot make use of the given ID.
 */
-Transliterator* Transliterator::_createInstance(const UnicodeString& ID) {
+Transliterator* Transliterator::_createInstance(const UnicodeString& ID,
+                                                ParseError* parseError) {
    UErrorCode status = U_ZERO_ERROR;

    if (!cacheInitialized) {
@ -660,8 +662,9 @@ Transliterator* Transliterator::_createInstance(const UnicodeString& ID) {

            data = TransliterationRuleParser::parse(*rules, isReverse
                                    ? RuleBasedTransliterator::REVERSE
-                                    : RuleBasedTransliterator::FORWARD);
-            
+                                    : RuleBasedTransliterator::FORWARD,
+                                    parseError);
+
            // Double check to see if someone has modified the entry
            // since we last looked at it.
            if (entry->entryType != CacheEntry::RBT_DATA) {
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -51,6 +51,7 @@ TransliteratorTest::runIndexedTest(int32_t index, bool_t exec,
        CASE(11,TestPatternQuoting);
        CASE(12,TestJ277);
        CASE(13,TestJ243);
+        CASE(14,TestJ329);
        default: name = ""; break;
    }
 }
@ -65,11 +66,15 @@ void TransliteratorTest::TestInstantiation() {
                  i + ") returned empty string");
            continue;
        }
-        Transliterator* t = Transliterator::createInstance(id);
+        ParseError parseError;
+        Transliterator* t = Transliterator::createInstance(id,
+                              Transliterator::FORWARD, &parseError);
        name.truncate(0);
        Transliterator::getDisplayName(id, name);
        if (t == 0) {
-            errln(UnicodeString("FAIL: Couldn't create ") + id);
+            errln(UnicodeString("FAIL: Couldn't create ") + id +
+                  ", parse error " + parseError.code + ", line " +
+                  parseError.line + ", offset " + parseError.offset);
            // When createInstance fails, it deletes the failing
            // entry from the available ID list.  We detect this
            // here by looking for a change in countAvailableIDs.
@ -577,6 +582,43 @@ void TransliteratorTest::TestJ243(void) {
    expect(hex3, "012", "&#x30;&#x31;&#x32;");
 }

+/**
+ * Parsers need better syntax error messages: builds a RuleBasedTransliterator from each rule string and checks that construction fails exactly when the rules are expected to contain an error, logging the ParseError fields on failure.
+ */
+void TransliteratorTest::TestJ329(void) {
+    
+    struct { bool_t containsErrors; const char* rule; } DATA[] = {
+        { FALSE, "a > b; c > d" }, // expected to parse without error
+        { TRUE,  "a > b; no operator; c > d" }, // second rule contains no operator; expected to fail
+    };
+    int32_t DATA_length = sizeof(DATA) / sizeof(DATA[0]); // number of test cases
+
+    for (int32_t i=0; i<DATA_length; ++i) {
+        UErrorCode status = U_ZERO_ERROR;
+        ParseError parseError;
+        RuleBasedTransliterator rbt("<ID>",
+                                    DATA[i].rule,
+                                    Transliterator::FORWARD,
+                                    0,
+                                    parseError,
+                                    status);
+        bool_t gotError = U_FAILURE(status); // construction reports failure through status
+        UnicodeString desc(DATA[i].rule);
+        desc.append(gotError ? " -> error" : " -> no error");
+        if (gotError) {
+            desc = desc + ", ParseError code=" + parseError.code +
+                " line=" + parseError.line +
+                " offset=" + parseError.offset +
+                " context=" + parseError.context;
+        }
+        if (gotError == DATA[i].containsErrors) { // pass only when an error occurred exactly where one was expected
+            logln(UnicodeString("Ok:   ") + desc);
+        } else {
+            errln(UnicodeString("FAIL: ") + desc);
+        }
+    }
+}
+
 //======================================================================
 // Support methods
 //======================================================================
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@ -91,6 +91,11 @@ class TransliteratorTest : public IntlTest {
     */
    void TestJ243(void);

+    /**
+     * Parsers need better syntax error messages.
+     */
+    void TestJ329(void);
+
    //======================================================================
    // Support methods
    //======================================================================